Merge r1476010 through r1476452 from trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-2802@1476453 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 2013-04-26 23:50:17 +00:00
commit 60341dae19
76 changed files with 1721 additions and 212 deletions

View File

@ -446,6 +446,28 @@ public class NetworkTopology {
return getNode(node.getNetworkLocation()); return getNode(node.getNetworkLocation());
} }
/**
* Given a string representation of a rack, return its children.
* @param loc a path-like string representation of a rack
* @return a newly allocated list with all of the rack's children
*/
public List<Node> getDatanodesInRack(String loc) {
netlock.readLock().lock();
try {
loc = NodeBase.normalize(loc);
if (!NodeBase.ROOT.equals(loc)) {
loc = loc.substring(1);
}
InnerNode rack = (InnerNode) clusterMap.getLoc(loc);
if (rack == null) {
return null;
}
return new ArrayList<Node>(rack.getChildren());
} finally {
netlock.readLock().unlock();
}
}
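
For context, a brief usage sketch of the new lookup; the node name and rack path below are placeholders for illustration and are not part of this change:

import java.util.List;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;

public class RackLookupSketch {
  public static void main(String[] args) {
    NetworkTopology topology = new NetworkTopology();
    // Hypothetical leaf node; "/default-rack" is an example rack path.
    topology.add(new NodeBase("datanode1", "/default-rack"));
    List<Node> rackNodes = topology.getDatanodesInRack("/default-rack");
    if (rackNodes != null) {  // null means the rack is not in the topology
      for (Node n : rackNodes) {
        System.out.println(n.getName());
      }
    }
  }
}
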
/** Remove a node /** Remove a node
* Update node counter and rack counter if necessary * Update node counter and rack counter if necessary
* @param node node to be removed; can be null * @param node node to be removed; can be null

View File

@ -22,6 +22,9 @@ Trunk (Unreleased)
Azure environments. (See breakdown of tasks below for subtasks and Azure environments. (See breakdown of tasks below for subtasks and
contributors) contributors)
HDFS-2576. Enhances the DistributedFileSystem's create API so that clients
can specify favored datanodes for a file's blocks. (ddas)
IMPROVEMENTS IMPROVEMENTS
HDFS-4665. Move TestNetworkTopologyWithNodeGroup to common. HDFS-4665. Move TestNetworkTopologyWithNodeGroup to common.
@ -262,6 +265,9 @@ Trunk (Unreleased)
HDFS-4757. Update FSDirectory#inodeMap when replacing an INodeDirectory HDFS-4757. Update FSDirectory#inodeMap when replacing an INodeDirectory
while setting quota. (Jing Zhao via szetszwo) while setting quota. (Jing Zhao via szetszwo)
HDFS-4761. When resetting FSDirectory, the inodeMap should also be reset.
(Jing Zhao via szetszwo)
BREAKDOWN OF HADOOP-8562 SUBTASKS AND RELATED JIRAS BREAKDOWN OF HADOOP-8562 SUBTASKS AND RELATED JIRAS
HDFS-4145. Merge hdfs cmd line scripts from branch-1-win. (David Lao, HDFS-4145. Merge hdfs cmd line scripts from branch-1-win. (David Lao,
@ -457,6 +463,10 @@ Release 2.0.5-beta - UNRELEASED
HDFS-4346. Add SequentialNumber as a base class for INodeId and HDFS-4346. Add SequentialNumber as a base class for INodeId and
GenerationStamp. (szetszwo) GenerationStamp. (szetszwo)
HDFS-4721. Speed up lease recovery by avoiding stale datanodes and choosing
the datanode with the most recent heartbeat as the primary. (Varun Sharma
via szetszwo)
OPTIMIZATIONS OPTIMIZATIONS
BUG FIXES BUG FIXES

View File

@ -1210,7 +1210,7 @@ public class DFSClient implements java.io.Closeable {
ChecksumOpt checksumOpt) ChecksumOpt checksumOpt)
throws IOException { throws IOException {
return create(src, permission, flag, true, return create(src, permission, flag, true,
replication, blockSize, progress, buffersize, checksumOpt); replication, blockSize, progress, buffersize, checksumOpt, null);
} }
/** /**
@ -1244,6 +1244,29 @@ public class DFSClient implements java.io.Closeable {
Progressable progress, Progressable progress,
int buffersize, int buffersize,
ChecksumOpt checksumOpt) throws IOException { ChecksumOpt checksumOpt) throws IOException {
return create(src, permission, flag, createParent, replication, blockSize,
progress, buffersize, checksumOpt, null);
}
/**
* Same as {@link #create(String, FsPermission, EnumSet, boolean, short, long,
* Progressable, int, ChecksumOpt)} with the addition of favoredNodes, a hint
* to the namenode about where the file's blocks should be placed.
* The favored nodes hint is not persisted in HDFS, so it is honored only at
* creation time; balancing or re-replication may later move blocks off the
* favored nodes. A value of null means no favored nodes for this create.
*/
public DFSOutputStream create(String src,
FsPermission permission,
EnumSet<CreateFlag> flag,
boolean createParent,
short replication,
long blockSize,
Progressable progress,
int buffersize,
ChecksumOpt checksumOpt,
InetSocketAddress[] favoredNodes) throws IOException {
checkOpen(); checkOpen();
if (permission == null) { if (permission == null) {
permission = FsPermission.getFileDefault(); permission = FsPermission.getFileDefault();
@ -1252,9 +1275,18 @@ public class DFSClient implements java.io.Closeable {
if(LOG.isDebugEnabled()) { if(LOG.isDebugEnabled()) {
LOG.debug(src + ": masked=" + masked); LOG.debug(src + ": masked=" + masked);
} }
String[] favoredNodeStrs = null;
if (favoredNodes != null) {
favoredNodeStrs = new String[favoredNodes.length];
for (int i = 0; i < favoredNodes.length; i++) {
favoredNodeStrs[i] =
favoredNodes[i].getAddress().getHostAddress() + ":"
+ favoredNodes[i].getPort();
}
}
final DFSOutputStream result = DFSOutputStream.newStreamForCreate(this, final DFSOutputStream result = DFSOutputStream.newStreamForCreate(this,
src, masked, flag, createParent, replication, blockSize, progress, src, masked, flag, createParent, replication, blockSize, progress,
buffersize, dfsClientConf.createChecksum(checksumOpt)); buffersize, dfsClientConf.createChecksum(checksumOpt), favoredNodeStrs);
beginFileLease(src, result); beginFileLease(src, result);
return result; return result;
} }

View File

@ -315,6 +315,7 @@ public class DFSOutputStream extends FSOutputSummer implements Syncable {
return key; return key;
} }
}); });
private String[] favoredNodes;
volatile boolean hasError = false; volatile boolean hasError = false;
volatile int errorIndex = -1; volatile int errorIndex = -1;
private BlockConstructionStage stage; // block construction stage private BlockConstructionStage stage; // block construction stage
@ -391,7 +392,11 @@ public class DFSOutputStream extends FSOutputSummer implements Syncable {
} }
} }
private void setFavoredNodes(String[] favoredNodes) {
this.favoredNodes = favoredNodes;
}
/** /**
* Initialize for data streaming * Initialize for data streaming
*/ */
@ -1177,7 +1182,7 @@ public class DFSOutputStream extends FSOutputSummer implements Syncable {
while (true) { while (true) {
try { try {
return dfsClient.namenode.addBlock(src, dfsClient.clientName, return dfsClient.namenode.addBlock(src, dfsClient.clientName,
block, excludedNodes, fileId); block, excludedNodes, fileId, favoredNodes);
} catch (RemoteException e) { } catch (RemoteException e) {
IOException ue = IOException ue =
e.unwrapRemoteException(FileNotFoundException.class, e.unwrapRemoteException(FileNotFoundException.class,
@ -1318,7 +1323,7 @@ public class DFSOutputStream extends FSOutputSummer implements Syncable {
/** Construct a new output stream for creating a file. */ /** Construct a new output stream for creating a file. */
private DFSOutputStream(DFSClient dfsClient, String src, HdfsFileStatus stat, private DFSOutputStream(DFSClient dfsClient, String src, HdfsFileStatus stat,
EnumSet<CreateFlag> flag, Progressable progress, EnumSet<CreateFlag> flag, Progressable progress,
DataChecksum checksum) throws IOException { DataChecksum checksum, String[] favoredNodes) throws IOException {
this(dfsClient, src, progress, stat, checksum); this(dfsClient, src, progress, stat, checksum);
this.shouldSyncBlock = flag.contains(CreateFlag.SYNC_BLOCK); this.shouldSyncBlock = flag.contains(CreateFlag.SYNC_BLOCK);
@ -1326,12 +1331,15 @@ public class DFSOutputStream extends FSOutputSummer implements Syncable {
checksum.getBytesPerChecksum()); checksum.getBytesPerChecksum());
streamer = new DataStreamer(); streamer = new DataStreamer();
if (favoredNodes != null && favoredNodes.length != 0) {
streamer.setFavoredNodes(favoredNodes);
}
} }
static DFSOutputStream newStreamForCreate(DFSClient dfsClient, String src, static DFSOutputStream newStreamForCreate(DFSClient dfsClient, String src,
FsPermission masked, EnumSet<CreateFlag> flag, boolean createParent, FsPermission masked, EnumSet<CreateFlag> flag, boolean createParent,
short replication, long blockSize, Progressable progress, int buffersize, short replication, long blockSize, Progressable progress, int buffersize,
DataChecksum checksum) throws IOException { DataChecksum checksum, String[] favoredNodes) throws IOException {
final HdfsFileStatus stat; final HdfsFileStatus stat;
try { try {
stat = dfsClient.namenode.create(src, masked, dfsClient.clientName, stat = dfsClient.namenode.create(src, masked, dfsClient.clientName,
@ -1349,11 +1357,19 @@ public class DFSOutputStream extends FSOutputSummer implements Syncable {
SnapshotAccessControlException.class); SnapshotAccessControlException.class);
} }
final DFSOutputStream out = new DFSOutputStream(dfsClient, src, stat, final DFSOutputStream out = new DFSOutputStream(dfsClient, src, stat,
flag, progress, checksum); flag, progress, checksum, favoredNodes);
out.start(); out.start();
return out; return out;
} }
static DFSOutputStream newStreamForCreate(DFSClient dfsClient, String src,
FsPermission masked, EnumSet<CreateFlag> flag, boolean createParent,
short replication, long blockSize, Progressable progress, int buffersize,
DataChecksum checksum) throws IOException {
return newStreamForCreate(dfsClient, src, masked, flag, createParent, replication,
blockSize, progress, buffersize, checksum, null);
}
/** Construct a new output stream for append. */ /** Construct a new output stream for append. */
private DFSOutputStream(DFSClient dfsClient, String src, private DFSOutputStream(DFSClient dfsClient, String src,
Progressable progress, LocatedBlock lastBlock, HdfsFileStatus stat, Progressable progress, LocatedBlock lastBlock, HdfsFileStatus stat,

View File

@ -268,6 +268,27 @@ public class DistributedFileSystem extends FileSystem {
: EnumSet.of(CreateFlag.CREATE), bufferSize, replication, : EnumSet.of(CreateFlag.CREATE), bufferSize, replication,
blockSize, progress, null); blockSize, progress, null);
} }
/**
* Same as
* {@link #create(Path, FsPermission, boolean, int, short, long,
* Progressable)} with the addition of favoredNodes, a hint to the namenode
* about where the file's blocks should be placed.
* The favored nodes hint is not persisted in HDFS, so it is honored only at
* creation time; balancing or re-replication may later move blocks off the
* favored nodes. A value of null means no favored nodes for this create.
*/
public HdfsDataOutputStream create(Path f, FsPermission permission,
boolean overwrite, int bufferSize, short replication, long blockSize,
Progressable progress, InetSocketAddress[] favoredNodes) throws IOException {
statistics.incrementWriteOps(1);
final DFSOutputStream out = dfs.create(getPathName(f), permission,
overwrite ? EnumSet.of(CreateFlag.CREATE, CreateFlag.OVERWRITE)
: EnumSet.of(CreateFlag.CREATE),
true, replication, blockSize, progress, bufferSize, null, favoredNodes);
return new HdfsDataOutputStream(out, statistics);
}
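
To make the new client-facing API concrete, a hedged usage sketch follows. The datanode host names, the transfer port, and the assumption that fs.defaultFS points at an HDFS cluster are placeholders for illustration only:

import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.client.HdfsDataOutputStream;

public class FavoredNodesCreateSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(conf);
    // Placeholder datanode addresses; 50010 is the default transfer port.
    InetSocketAddress[] favored = new InetSocketAddress[] {
        new InetSocketAddress("dn1.example.com", 50010),
        new InetSocketAddress("dn2.example.com", 50010)
    };
    HdfsDataOutputStream out = dfs.create(new Path("/tmp/favored-file"),
        FsPermission.getFileDefault(), true /* overwrite */,
        4096 /* bufferSize */, (short) 3 /* replication */,
        128L * 1024 * 1024 /* blockSize */, null /* progress */, favored);
    out.write("hello".getBytes("UTF-8"));
    out.close();
  }
}

Since the hint is advisory, the pipeline returned by the namenode may still contain other datanodes when the favored ones are unavailable or excluded.
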
@Override @Override
public HdfsDataOutputStream create(Path f, FsPermission permission, public HdfsDataOutputStream create(Path f, FsPermission permission,

View File

@ -306,6 +306,8 @@ public interface ClientProtocol {
* @param excludeNodes a list of nodes that should not be * @param excludeNodes a list of nodes that should not be
* allocated for the current block * allocated for the current block
* @param fileId the id uniquely identifying a file * @param fileId the id uniquely identifying a file
* @param favoredNodes the list of nodes where the client wants the blocks.
* Nodes are identified by either host name or address.
* *
* @return LocatedBlock allocated block information. * @return LocatedBlock allocated block information.
* *
@ -320,7 +322,8 @@ public interface ClientProtocol {
*/ */
@Idempotent @Idempotent
public LocatedBlock addBlock(String src, String clientName, public LocatedBlock addBlock(String src, String clientName,
ExtendedBlock previous, DatanodeInfo[] excludeNodes, long fileId) ExtendedBlock previous, DatanodeInfo[] excludeNodes, long fileId,
String[] favoredNodes)
throws AccessControlException, FileNotFoundException, throws AccessControlException, FileNotFoundException,
NotReplicatedYetException, SafeModeException, UnresolvedLinkException, NotReplicatedYetException, SafeModeException, UnresolvedLinkException,
IOException; IOException;

View File

@ -382,12 +382,15 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
try { try {
List<DatanodeInfoProto> excl = req.getExcludeNodesList(); List<DatanodeInfoProto> excl = req.getExcludeNodesList();
List<String> favor = req.getFavoredNodesList();
LocatedBlock result = server.addBlock( LocatedBlock result = server.addBlock(
req.getSrc(), req.getSrc(),
req.getClientName(), req.getClientName(),
req.hasPrevious() ? PBHelper.convert(req.getPrevious()) : null, req.hasPrevious() ? PBHelper.convert(req.getPrevious()) : null,
(excl == null || excl.size() == 0) ? null : PBHelper.convert(excl (excl == null || excl.size() == 0) ? null : PBHelper.convert(excl
.toArray(new DatanodeInfoProto[excl.size()])), req.getFileId()); .toArray(new DatanodeInfoProto[excl.size()])), req.getFileId(),
(favor == null || favor.size() == 0) ? null : favor
.toArray(new String[favor.size()]));
return AddBlockResponseProto.newBuilder() return AddBlockResponseProto.newBuilder()
.setBlock(PBHelper.convert(result)).build(); .setBlock(PBHelper.convert(result)).build();
} catch (IOException e) { } catch (IOException e) {

View File

@ -312,7 +312,8 @@ public class ClientNamenodeProtocolTranslatorPB implements
@Override @Override
public LocatedBlock addBlock(String src, String clientName, public LocatedBlock addBlock(String src, String clientName,
ExtendedBlock previous, DatanodeInfo[] excludeNodes, long fileId) ExtendedBlock previous, DatanodeInfo[] excludeNodes, long fileId,
String[] favoredNodes)
throws AccessControlException, FileNotFoundException, throws AccessControlException, FileNotFoundException,
NotReplicatedYetException, SafeModeException, UnresolvedLinkException, NotReplicatedYetException, SafeModeException, UnresolvedLinkException,
IOException { IOException {
@ -322,6 +323,9 @@ public class ClientNamenodeProtocolTranslatorPB implements
req.setPrevious(PBHelper.convert(previous)); req.setPrevious(PBHelper.convert(previous));
if (excludeNodes != null) if (excludeNodes != null)
req.addAllExcludeNodes(PBHelper.convert(excludeNodes)); req.addAllExcludeNodes(PBHelper.convert(excludeNodes));
if (favoredNodes != null) {
req.addAllFavoredNodes(Arrays.asList(favoredNodes));
}
try { try {
return PBHelper.convert(rpcProxy.addBlock(null, req.build()).getBlock()); return PBHelper.convert(rpcProxy.addBlock(null, req.build()).getBlock());
} catch (ServiceException e) { } catch (ServiceException e) {

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -41,7 +42,10 @@ public class BlockInfoUnderConstruction extends BlockInfo {
*/ */
private List<ReplicaUnderConstruction> replicas; private List<ReplicaUnderConstruction> replicas;
/** A data-node responsible for block recovery. */ /**
* Index of the primary data node doing the recovery. Useful for log
* messages.
*/
private int primaryNodeIndex = -1; private int primaryNodeIndex = -1;
/** /**
@ -62,6 +66,7 @@ public class BlockInfoUnderConstruction extends BlockInfo {
static class ReplicaUnderConstruction extends Block { static class ReplicaUnderConstruction extends Block {
private DatanodeDescriptor expectedLocation; private DatanodeDescriptor expectedLocation;
private ReplicaState state; private ReplicaState state;
private boolean chosenAsPrimary;
ReplicaUnderConstruction(Block block, ReplicaUnderConstruction(Block block,
DatanodeDescriptor target, DatanodeDescriptor target,
@ -69,6 +74,7 @@ public class BlockInfoUnderConstruction extends BlockInfo {
super(block); super(block);
this.expectedLocation = target; this.expectedLocation = target;
this.state = state; this.state = state;
this.chosenAsPrimary = false;
} }
/** /**
@ -88,6 +94,13 @@ public class BlockInfoUnderConstruction extends BlockInfo {
return state; return state;
} }
/**
* Whether the replica was chosen for recovery.
*/
boolean getChosenAsPrimary() {
return chosenAsPrimary;
}
/** /**
* Set replica state. * Set replica state.
*/ */
@ -95,6 +108,13 @@ public class BlockInfoUnderConstruction extends BlockInfo {
state = s; state = s;
} }
/**
* Set whether this replica was chosen for recovery.
*/
void setChosenAsPrimary(boolean chosenAsPrimary) {
this.chosenAsPrimary = chosenAsPrimary;
}
/** /**
* Is data-node the replica belongs to alive. * Is data-node the replica belongs to alive.
*/ */
@ -237,19 +257,40 @@ public class BlockInfoUnderConstruction extends BlockInfo {
+ " BlockInfoUnderConstruction.initLeaseRecovery:" + " BlockInfoUnderConstruction.initLeaseRecovery:"
+ " No blocks found, lease removed."); + " No blocks found, lease removed.");
} }
boolean allLiveReplicasTriedAsPrimary = true;
int previous = primaryNodeIndex; for (int i = 0; i < replicas.size(); i++) {
for(int i = 1; i <= replicas.size(); i++) { // Check if all replicas have been tried or not.
int j = (previous + i)%replicas.size(); if (replicas.get(i).isAlive()) {
if (replicas.get(j).isAlive()) { allLiveReplicasTriedAsPrimary =
primaryNodeIndex = j; (allLiveReplicasTriedAsPrimary && replicas.get(i).getChosenAsPrimary());
DatanodeDescriptor primary = replicas.get(j).getExpectedLocation();
primary.addBlockToBeRecovered(this);
NameNode.blockStateChangeLog.info("BLOCK* " + this
+ " recovery started, primary=" + primary);
return;
} }
} }
if (allLiveReplicasTriedAsPrimary) {
// Just set all the replicas to be chosen whether they are alive or not.
for (int i = 0; i < replicas.size(); i++) {
replicas.get(i).setChosenAsPrimary(false);
}
}
long mostRecentLastUpdate = 0;
ReplicaUnderConstruction primary = null;
primaryNodeIndex = -1;
for(int i = 0; i < replicas.size(); i++) {
// Skip alive replicas which have been chosen for recovery.
if (!(replicas.get(i).isAlive() && !replicas.get(i).getChosenAsPrimary())) {
continue;
}
if (replicas.get(i).getExpectedLocation().getLastUpdate() > mostRecentLastUpdate) {
primary = replicas.get(i);
primaryNodeIndex = i;
mostRecentLastUpdate = primary.getExpectedLocation().getLastUpdate();
}
}
if (primary != null) {
primary.getExpectedLocation().addBlockToBeRecovered(this);
primary.setChosenAsPrimary(true);
NameNode.blockStateChangeLog.info("BLOCK* " + this
+ " recovery started, primary=" + primary);
}
} }
void addReplicaIfNotPresent(DatanodeDescriptor dn, void addReplicaIfNotPresent(DatanodeDescriptor dn,

View File

@ -59,6 +59,7 @@ import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo; import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats; import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem; import org.apache.hadoop.hdfs.server.namenode.Namesystem;
@ -72,6 +73,7 @@ import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.util.LightWeightLinkedSet; import org.apache.hadoop.hdfs.util.LightWeightLinkedSet;
import org.apache.hadoop.net.Node; import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
@ -1333,11 +1335,12 @@ public class BlockManager {
public DatanodeDescriptor[] chooseTarget(final String src, public DatanodeDescriptor[] chooseTarget(final String src,
final int numOfReplicas, final DatanodeDescriptor client, final int numOfReplicas, final DatanodeDescriptor client,
final HashMap<Node, Node> excludedNodes, final HashMap<Node, Node> excludedNodes,
final long blocksize) throws IOException { final long blocksize, List<String> favoredNodes) throws IOException {
// choose targets for the new block to be allocated. List<DatanodeDescriptor> favoredDatanodeDescriptors =
getDatanodeDescriptors(favoredNodes);
final DatanodeDescriptor targets[] = blockplacement.chooseTarget(src, final DatanodeDescriptor targets[] = blockplacement.chooseTarget(src,
numOfReplicas, client, new ArrayList<DatanodeDescriptor>(), false, numOfReplicas, client, excludedNodes, blocksize,
excludedNodes, blocksize); favoredDatanodeDescriptors);
if (targets.length < minReplication) { if (targets.length < minReplication) {
throw new IOException("File " + src + " could only be replicated to " throw new IOException("File " + src + " could only be replicated to "
+ targets.length + " nodes instead of minReplication (=" + targets.length + " nodes instead of minReplication (="
@ -1350,6 +1353,24 @@ public class BlockManager {
return targets; return targets;
} }
/**
* Get the list of datanode descriptors for the given list of nodes. Nodes are
* identified as host:xfer-port or just by host address.
*/
List<DatanodeDescriptor> getDatanodeDescriptors(List<String> nodes) {
List<DatanodeDescriptor> datanodeDescriptors = null;
if (nodes != null) {
datanodeDescriptors = new ArrayList<DatanodeDescriptor>(nodes.size());
for (int i = 0; i < nodes.size(); i++) {
DatanodeDescriptor node = datanodeManager.getDatanodeDescriptor(nodes.get(i));
if (node != null) {
datanodeDescriptors.add(node);
}
}
}
return datanodeDescriptors;
}
/** /**
* Parse the data-nodes the block belongs to and choose one, * Parse the data-nodes the block belongs to and choose one,
* which will be the replication source. * which will be the replication source.

View File

@ -118,6 +118,25 @@ public abstract class BlockPlacementPolicy {
return chooseTarget(srcBC.getName(), numOfReplicas, writer, return chooseTarget(srcBC.getName(), numOfReplicas, writer,
chosenNodes, false, excludedNodes, blocksize); chosenNodes, false, excludedNodes, blocksize);
} }
/**
* Same as {@link #chooseTarget(String, int, DatanodeDescriptor, List, boolean,
* HashMap, long)} with the added parameter {@code favoredNodes}.
* @param favoredNodes datanodes that should be favored as targets. This
* is only a hint; depending on cluster state, the namenode may not be
* able to place the blocks on these datanodes.
*/
DatanodeDescriptor[] chooseTarget(String src,
int numOfReplicas, DatanodeDescriptor writer,
HashMap<Node, Node> excludedNodes,
long blocksize, List<DatanodeDescriptor> favoredNodes) {
// This class does not provide the functionality of placing
// a block in favored datanodes. The implementations of this class
// are expected to provide this functionality
return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeDescriptor>(numOfReplicas), false, excludedNodes,
blocksize);
}
/** /**
* Verify that the block is replicated on at least minRacks different racks * Verify that the block is replicated on at least minRacks different racks

View File

@ -125,6 +125,60 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
excludedNodes, blocksize); excludedNodes, blocksize);
} }
@Override
DatanodeDescriptor[] chooseTarget(String src, int numOfReplicas,
DatanodeDescriptor writer, HashMap<Node, Node> excludedNodes,
long blocksize, List<DatanodeDescriptor> favoredNodes) {
try {
if (favoredNodes == null || favoredNodes.size() == 0) {
// Favored nodes not specified, fall back to regular block placement.
return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeDescriptor>(numOfReplicas), false,
excludedNodes, blocksize);
}
HashMap<Node, Node> favoriteAndExcludedNodes = excludedNodes == null ?
new HashMap<Node, Node>() : new HashMap<Node, Node>(excludedNodes);
// Choose favored nodes
List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>();
boolean avoidStaleNodes = stats != null
&& stats.isAvoidingStaleDataNodesForWrite();
for (int i = 0; i < Math.min(favoredNodes.size(), numOfReplicas); i++) {
DatanodeDescriptor favoredNode = favoredNodes.get(i);
// Choose a single node which is local to favoredNode.
// 'results' is updated within chooseLocalNode
DatanodeDescriptor target = chooseLocalNode(favoredNode,
favoriteAndExcludedNodes, blocksize,
getMaxNodesPerRack(results,
numOfReplicas)[1], results, avoidStaleNodes);
if (target == null) {
LOG.warn("Could not find a target for file " + src
+ " with favored node " + favoredNode);
continue;
}
favoriteAndExcludedNodes.put(target, target);
}
if (results.size() < numOfReplicas) {
// Not enough favored nodes, choose other nodes.
numOfReplicas -= results.size();
DatanodeDescriptor[] remainingTargets =
chooseTarget(src, numOfReplicas, writer, results,
false, favoriteAndExcludedNodes, blocksize);
for (int i = 0; i < remainingTargets.length; i++) {
results.add(remainingTargets[i]);
}
}
return results.toArray(new DatanodeDescriptor[results.size()]);
} catch (NotEnoughReplicasException nr) {
// Fall back to regular block placement disregarding favored nodes hint
return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeDescriptor>(numOfReplicas), false,
excludedNodes, blocksize);
}
}
/** This is the implementation. */ /** This is the implementation. */
DatanodeDescriptor[] chooseTarget(int numOfReplicas, DatanodeDescriptor[] chooseTarget(int numOfReplicas,
DatanodeDescriptor writer, DatanodeDescriptor writer,
@ -140,15 +194,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
excludedNodes = new HashMap<Node, Node>(); excludedNodes = new HashMap<Node, Node>();
} }
int clusterSize = clusterMap.getNumOfLeaves(); int[] result = getMaxNodesPerRack(chosenNodes, numOfReplicas);
int totalNumOfReplicas = chosenNodes.size()+numOfReplicas; numOfReplicas = result[0];
if (totalNumOfReplicas > clusterSize) { int maxNodesPerRack = result[1];
numOfReplicas -= (totalNumOfReplicas-clusterSize);
totalNumOfReplicas = clusterSize;
}
int maxNodesPerRack =
(totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;
List<DatanodeDescriptor> results = List<DatanodeDescriptor> results =
new ArrayList<DatanodeDescriptor>(chosenNodes); new ArrayList<DatanodeDescriptor>(chosenNodes);
@ -174,6 +222,18 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
return getPipeline((writer==null)?localNode:writer, return getPipeline((writer==null)?localNode:writer,
results.toArray(new DatanodeDescriptor[results.size()])); results.toArray(new DatanodeDescriptor[results.size()]));
} }
private int[] getMaxNodesPerRack(List<DatanodeDescriptor> chosenNodes,
int numOfReplicas) {
int clusterSize = clusterMap.getNumOfLeaves();
int totalNumOfReplicas = chosenNodes.size()+numOfReplicas;
if (totalNumOfReplicas > clusterSize) {
numOfReplicas -= (totalNumOfReplicas-clusterSize);
totalNumOfReplicas = clusterSize;
}
int maxNodesPerRack = (totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;
return new int[] {numOfReplicas, maxNodesPerRack};
}
/* choose <i>numOfReplicas</i> from all data nodes */ /* choose <i>numOfReplicas</i> from all data nodes */
private DatanodeDescriptor chooseTarget(int numOfReplicas, private DatanodeDescriptor chooseTarget(int numOfReplicas,

View File

@ -213,7 +213,7 @@ public class DatanodeManager {
" = '" + ratioUseStaleDataNodesForWrite + "' is invalid. " + " = '" + ratioUseStaleDataNodesForWrite + "' is invalid. " +
"It should be a positive non-zero float value, not greater than 1.0f."); "It should be a positive non-zero float value, not greater than 1.0f.");
} }
private static long getStaleIntervalFromConf(Configuration conf, private static long getStaleIntervalFromConf(Configuration conf,
long heartbeatExpireInterval) { long heartbeatExpireInterval) {
long staleInterval = conf.getLong( long staleInterval = conf.getLong(
@ -326,6 +326,68 @@ public class DatanodeManager {
return host2DatanodeMap.getDatanodeByHost(host); return host2DatanodeMap.getDatanodeByHost(host);
} }
/** @return the datanode descriptor for the host. */
public DatanodeDescriptor getDatanodeByXferAddr(String host, int xferPort) {
return host2DatanodeMap.getDatanodeByXferAddr(host, xferPort);
}
/**
* Given a datanode address or host name, returns the corresponding
* DatanodeDescriptor. If the datanode is not found, it falls back to a
* machine-local and then a rack-local datanode; if neither is available,
* it returns the DatanodeDescriptor of a random node in the cluster.
*
* @param address host address, optionally followed by :transfer-port
* @return the best match for the given datanode
*/
DatanodeDescriptor getDatanodeDescriptor(String address) {
DatanodeDescriptor node = null;
int colon = address.indexOf(":");
int xferPort;
String host = address;
if (colon > 0) {
host = address.substring(0, colon);
xferPort = Integer.parseInt(address.substring(colon+1));
node = getDatanodeByXferAddr(host, xferPort);
}
if (node == null) {
node = getDatanodeByHost(host);
}
if (node == null) {
String networkLocation = resolveNetworkLocation(host);
// If the current cluster doesn't contain the node, fallback to
// something machine local and then rack local.
List<Node> rackNodes = getNetworkTopology()
.getDatanodesInRack(networkLocation);
if (rackNodes != null) {
// Try something machine local.
for (Node rackNode : rackNodes) {
if (((DatanodeDescriptor) rackNode).getIpAddr().equals(host)) {
node = (DatanodeDescriptor) rackNode;
break;
}
}
// Try something rack local.
if (node == null && !rackNodes.isEmpty()) {
node = (DatanodeDescriptor) (rackNodes
.get(DFSUtil.getRandom().nextInt(rackNodes.size())));
}
}
// If we can't even choose rack local, just choose any node in the
// cluster.
if (node == null) {
node = (DatanodeDescriptor)getNetworkTopology()
.chooseRandom(NodeBase.ROOT);
}
}
return node;
}
/** Get a datanode descriptor given corresponding storageID */ /** Get a datanode descriptor given corresponding storageID */
DatanodeDescriptor getDatanode(final String storageID) { DatanodeDescriptor getDatanode(final String storageID) {
return datanodeMap.get(storageID); return datanodeMap.get(storageID);
@ -455,8 +517,13 @@ public class DatanodeManager {
} }
} }
public String resolveNetworkLocation(String host) {
DatanodeID d = parseDNFromHostsEntry(host);
return resolveNetworkLocation(d);
}
/* Resolve a node's network location */ /* Resolve a node's network location */
private void resolveNetworkLocation (DatanodeDescriptor node) { private String resolveNetworkLocation (DatanodeID node) {
List<String> names = new ArrayList<String>(1); List<String> names = new ArrayList<String>(1);
if (dnsToSwitchMapping instanceof CachedDNSToSwitchMapping) { if (dnsToSwitchMapping instanceof CachedDNSToSwitchMapping) {
names.add(node.getIpAddr()); names.add(node.getIpAddr());
@ -474,7 +541,7 @@ public class DatanodeManager {
} else { } else {
networkLocation = rName.get(0); networkLocation = rName.get(0);
} }
node.setNetworkLocation(networkLocation); return networkLocation;
} }
private boolean inHostsList(DatanodeID node) { private boolean inHostsList(DatanodeID node) {
@ -707,7 +774,7 @@ public class DatanodeManager {
nodeS.setDisallowed(false); // Node is in the include list nodeS.setDisallowed(false); // Node is in the include list
// resolve network location // resolve network location
resolveNetworkLocation(nodeS); nodeS.setNetworkLocation(resolveNetworkLocation(nodeS));
getNetworkTopology().add(nodeS); getNetworkTopology().add(nodeS);
// also treat the registration message as a heartbeat // also treat the registration message as a heartbeat
@ -739,7 +806,7 @@ public class DatanodeManager {
= new DatanodeDescriptor(nodeReg, NetworkTopology.DEFAULT_RACK); = new DatanodeDescriptor(nodeReg, NetworkTopology.DEFAULT_RACK);
boolean success = false; boolean success = false;
try { try {
resolveNetworkLocation(nodeDescr); nodeDescr.setNetworkLocation(resolveNetworkLocation(nodeDescr));
networktopology.add(nodeDescr); networktopology.add(nodeDescr);
// register new datanode // register new datanode
@ -875,7 +942,7 @@ public class DatanodeManager {
(numStaleNodes <= heartbeatManager.getLiveDatanodeCount() (numStaleNodes <= heartbeatManager.getLiveDatanodeCount()
* ratioUseStaleDataNodesForWrite); * ratioUseStaleDataNodesForWrite);
} }
/** /**
* @return The time interval used to mark DataNodes as stale. * @return The time interval used to mark DataNodes as stale.
*/ */
@ -1093,7 +1160,7 @@ public class DatanodeManager {
* failed. As a special case, the loopback address is also considered * failed. As a special case, the loopback address is also considered
* acceptable. This is particularly important on Windows, where 127.0.0.1 does * acceptable. This is particularly important on Windows, where 127.0.0.1 does
* not resolve to "localhost". * not resolve to "localhost".
* *
* @param address InetAddress to check * @param address InetAddress to check
* @return boolean true if name resolution successful or address is loopback * @return boolean true if name resolution successful or address is loopback
*/ */
@ -1127,7 +1194,7 @@ public class DatanodeManager {
setDatanodeDead(nodeinfo); setDatanodeDead(nodeinfo);
throw new DisallowedDatanodeException(nodeinfo); throw new DisallowedDatanodeException(nodeinfo);
} }
if (nodeinfo == null || !nodeinfo.isAlive) { if (nodeinfo == null || !nodeinfo.isAlive) {
return new DatanodeCommand[]{RegisterCommand.REGISTER}; return new DatanodeCommand[]{RegisterCommand.REGISTER};
} }
@ -1142,9 +1209,34 @@ public class DatanodeManager {
BlockRecoveryCommand brCommand = new BlockRecoveryCommand( BlockRecoveryCommand brCommand = new BlockRecoveryCommand(
blocks.length); blocks.length);
for (BlockInfoUnderConstruction b : blocks) { for (BlockInfoUnderConstruction b : blocks) {
brCommand.add(new RecoveringBlock( DatanodeDescriptor[] expectedLocations = b.getExpectedLocations();
new ExtendedBlock(blockPoolId, b), b.getExpectedLocations(), b // Skip stale nodes during recovery - not heart beated for some time (30s by default).
.getBlockRecoveryId())); List<DatanodeDescriptor> recoveryLocations =
new ArrayList<DatanodeDescriptor>(expectedLocations.length);
for (int i = 0; i < expectedLocations.length; i++) {
if (!expectedLocations[i].isStale(this.staleInterval)) {
recoveryLocations.add(expectedLocations[i]);
}
}
// If we only get 1 replica after eliminating stale nodes, then choose all
// replicas for recovery and let the primary data node handle failures.
if (recoveryLocations.size() > 1) {
if (recoveryLocations.size() != expectedLocations.length) {
LOG.info("Skipped stale nodes for recovery : " +
(expectedLocations.length - recoveryLocations.size()));
}
brCommand.add(new RecoveringBlock(
new ExtendedBlock(blockPoolId, b),
recoveryLocations.toArray(new DatanodeDescriptor[recoveryLocations.size()]),
b.getBlockRecoveryId()));
} else {
// If too many replicas are stale, then choose all replicas to participate
// in block recovery.
brCommand.add(new RecoveringBlock(
new ExtendedBlock(blockPoolId, b),
expectedLocations,
b.getBlockRecoveryId()));
}
} }
return new DatanodeCommand[] { brCommand }; return new DatanodeCommand[] { brCommand };
} }

View File

@ -2452,6 +2452,8 @@ public class FSDirectory implements Closeable {
try { try {
setReady(false); setReady(false);
rootDir = createRoot(getFSNamesystem()); rootDir = createRoot(getFSNamesystem());
inodeMap.clear();
addToInodeMapUnprotected(rootDir);
nameCache.reset(); nameCache.reset();
} finally { } finally {
writeUnlock(); writeUnlock();

View File

@ -2228,7 +2228,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
* client to "try again later". * client to "try again later".
*/ */
LocatedBlock getAdditionalBlock(String src, long fileId, String clientName, LocatedBlock getAdditionalBlock(String src, long fileId, String clientName,
ExtendedBlock previous, HashMap<Node, Node> excludedNodes) ExtendedBlock previous, HashMap<Node, Node> excludedNodes,
List<String> favoredNodes)
throws LeaseExpiredException, NotReplicatedYetException, throws LeaseExpiredException, NotReplicatedYetException,
QuotaExceededException, SafeModeException, UnresolvedLinkException, QuotaExceededException, SafeModeException, UnresolvedLinkException,
IOException { IOException {
@ -2268,8 +2269,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
} }
// choose targets for the new block to be allocated. // choose targets for the new block to be allocated.
final DatanodeDescriptor targets[] = getBlockManager().chooseTarget( final DatanodeDescriptor targets[] = getBlockManager().chooseTarget(
src, replication, clientNode, excludedNodes, blockSize); src, replication, clientNode, excludedNodes, blockSize, favoredNodes);
// Part II. // Part II.
// Allocate a new block, add it to the INode and the BlocksMap. // Allocate a new block, add it to the INode and the BlocksMap.

View File

@ -29,6 +29,7 @@ import java.net.InetSocketAddress;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -484,7 +485,8 @@ class NameNodeRpcServer implements NamenodeProtocols {
@Override @Override
public LocatedBlock addBlock(String src, String clientName, public LocatedBlock addBlock(String src, String clientName,
ExtendedBlock previous, DatanodeInfo[] excludedNodes, long fileId) ExtendedBlock previous, DatanodeInfo[] excludedNodes, long fileId,
String[] favoredNodes)
throws IOException { throws IOException {
if (stateChangeLog.isDebugEnabled()) { if (stateChangeLog.isDebugEnabled()) {
stateChangeLog.debug("*BLOCK* NameNode.addBlock: file " + src stateChangeLog.debug("*BLOCK* NameNode.addBlock: file " + src
@ -497,8 +499,10 @@ class NameNodeRpcServer implements NamenodeProtocols {
excludedNodesSet.put(node, node); excludedNodesSet.put(node, node);
} }
} }
List<String> favoredNodesList = (favoredNodes == null) ? null
: Arrays.asList(favoredNodes);
LocatedBlock locatedBlock = namesystem.getAdditionalBlock(src, fileId, LocatedBlock locatedBlock = namesystem.getAdditionalBlock(src, fileId,
clientName, previous, excludedNodesSet); clientName, previous, excludedNodesSet, favoredNodesList);
if (locatedBlock != null) if (locatedBlock != null)
metrics.incrAddBlockOps(); metrics.incrAddBlockOps();
return locatedBlock; return locatedBlock;

View File

@ -121,6 +121,7 @@ message AddBlockRequestProto {
optional ExtendedBlockProto previous = 3; optional ExtendedBlockProto previous = 3;
repeated DatanodeInfoProto excludeNodes = 4; repeated DatanodeInfoProto excludeNodes = 4;
optional uint64 fileId = 5 [default = 0]; // default as a bogus id optional uint64 fileId = 5 [default = 0]; // default as a bogus id
repeated string favoredNodes = 6; //the set of datanodes to use for the block
} }
message AddBlockResponseProto { message AddBlockResponseProto {

View File

@ -1072,7 +1072,10 @@
otherwise this may cause too frequent change of stale states. otherwise this may cause too frequent change of stale states.
We thus set a minimum stale interval value (the default value is 3 times We thus set a minimum stale interval value (the default value is 3 times
of heartbeat interval) and guarantee that the stale interval cannot be less of heartbeat interval) and guarantee that the stale interval cannot be less
than the minimum value. than the minimum value. A stale data node is avoided during lease/block
recovery. It can be conditionally avoided for reads (see
dfs.namenode.avoid.read.stale.datanode) and for writes (see
dfs.namenode.avoid.write.stale.datanode).
</description> </description>
</property> </property>
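
To show how these settings fit together on the client or NameNode side, a hedged Java sketch follows; the property names match the description above, the interval shown is simply the default, and the snippet is illustrative rather than part of this change:

import org.apache.hadoop.conf.Configuration;

public class StaleDatanodeConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // A datanode is considered stale after 30s without a heartbeat (default).
    conf.setLong("dfs.namenode.stale.datanode.interval", 30 * 1000L);
    // Optionally avoid stale datanodes for reads and writes as well;
    // lease/block recovery skips stale datanodes regardless of these flags.
    conf.setBoolean("dfs.namenode.avoid.read.stale.datanode", true);
    conf.setBoolean("dfs.namenode.avoid.write.stale.datanode", true);
    System.out.println(conf.get("dfs.namenode.stale.datanode.interval"));
  }
}
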

View File

@ -241,7 +241,7 @@ public class TestDFSClientRetries {
anyString(), anyString(),
any(ExtendedBlock.class), any(ExtendedBlock.class),
any(DatanodeInfo[].class), any(DatanodeInfo[].class),
anyLong())).thenAnswer(answer); anyLong(), any(String[].class))).thenAnswer(answer);
Mockito.doReturn( Mockito.doReturn(
new HdfsFileStatus(0, false, 1, 1024, 0, 0, new FsPermission( new HdfsFileStatus(0, false, 1, 1024, 0, 0, new FsPermission(
@ -390,7 +390,7 @@ public class TestDFSClientRetries {
} }
}).when(spyNN).addBlock(Mockito.anyString(), Mockito.anyString(), }).when(spyNN).addBlock(Mockito.anyString(), Mockito.anyString(),
Mockito.<ExtendedBlock> any(), Mockito.<DatanodeInfo[]> any(), Mockito.<ExtendedBlock> any(), Mockito.<DatanodeInfo[]> any(),
Mockito.anyLong()); Mockito.anyLong(), Mockito.<String[]> any());
doAnswer(new Answer<Boolean>() { doAnswer(new Answer<Boolean>() {
@ -432,7 +432,7 @@ public class TestDFSClientRetries {
Mockito.verify(spyNN, Mockito.atLeastOnce()).addBlock( Mockito.verify(spyNN, Mockito.atLeastOnce()).addBlock(
Mockito.anyString(), Mockito.anyString(), Mockito.anyString(), Mockito.anyString(),
Mockito.<ExtendedBlock> any(), Mockito.<DatanodeInfo[]> any(), Mockito.<ExtendedBlock> any(), Mockito.<DatanodeInfo[]> any(),
Mockito.anyLong()); Mockito.anyLong(), Mockito.<String[]> any());
Mockito.verify(spyNN, Mockito.atLeastOnce()).complete( Mockito.verify(spyNN, Mockito.atLeastOnce()).complete(
Mockito.anyString(), Mockito.anyString(), Mockito.anyString(), Mockito.anyString(),
Mockito.<ExtendedBlock>any()); Mockito.<ExtendedBlock>any());

View File

@ -519,7 +519,7 @@ public class TestFileCreation {
// add one block to the file // add one block to the file
LocatedBlock location = client.getNamenode().addBlock(file1.toString(), LocatedBlock location = client.getNamenode().addBlock(file1.toString(),
client.clientName, null, null, INodeId.GRANDFATHER_INODE_ID); client.clientName, null, null, INodeId.GRANDFATHER_INODE_ID, null);
System.out.println("testFileCreationError2: " System.out.println("testFileCreationError2: "
+ "Added block " + location.getBlock()); + "Added block " + location.getBlock());
@ -570,7 +570,7 @@ public class TestFileCreation {
createFile(dfs, f, 3); createFile(dfs, f, 3);
try { try {
cluster.getNameNodeRpc().addBlock(f.toString(), client.clientName, cluster.getNameNodeRpc().addBlock(f.toString(), client.clientName,
null, null, INodeId.GRANDFATHER_INODE_ID); null, null, INodeId.GRANDFATHER_INODE_ID, null);
fail(); fail();
} catch(IOException ioe) { } catch(IOException ioe) {
FileSystem.LOG.info("GOOD!", ioe); FileSystem.LOG.info("GOOD!", ioe);

View File

@ -0,0 +1,86 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import static org.junit.Assert.assertEquals;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.junit.Test;
/**
* This class provides tests for BlockInfoUnderConstruction class
*/
public class TestBlockInfoUnderConstruction {
@Test
public void testInitializeBlockRecovery() throws Exception {
DatanodeDescriptor dd1 = DFSTestUtil.getDatanodeDescriptor("10.10.1.1",
"default");
DatanodeDescriptor dd2 = DFSTestUtil.getDatanodeDescriptor("10.10.1.2",
"default");
DatanodeDescriptor dd3 = DFSTestUtil.getDatanodeDescriptor("10.10.1.3",
"default");
dd1.isAlive = dd2.isAlive = dd3.isAlive = true;
BlockInfoUnderConstruction blockInfo = new BlockInfoUnderConstruction(
new Block(0, 0, GenerationStamp.LAST_RESERVED_STAMP),
3,
BlockUCState.UNDER_CONSTRUCTION,
new DatanodeDescriptor[] {dd1, dd2, dd3});
// Recovery attempt #1.
long currentTime = System.currentTimeMillis();
dd1.setLastUpdate(currentTime - 3 * 1000);
dd2.setLastUpdate(currentTime - 1 * 1000);
dd3.setLastUpdate(currentTime - 2 * 1000);
blockInfo.initializeBlockRecovery(1);
BlockInfoUnderConstruction[] blockInfoRecovery = dd2.getLeaseRecoveryCommand(1);
assertEquals(blockInfoRecovery[0], blockInfo);
// Recovery attempt #2.
currentTime = System.currentTimeMillis();
dd1.setLastUpdate(currentTime - 2 * 1000);
dd2.setLastUpdate(currentTime - 1 * 1000);
dd3.setLastUpdate(currentTime - 3 * 1000);
blockInfo.initializeBlockRecovery(2);
blockInfoRecovery = dd1.getLeaseRecoveryCommand(1);
assertEquals(blockInfoRecovery[0], blockInfo);
// Recovery attempt #3.
currentTime = System.currentTimeMillis();
dd1.setLastUpdate(currentTime - 2 * 1000);
dd2.setLastUpdate(currentTime - 1 * 1000);
dd3.setLastUpdate(currentTime - 3 * 1000);
currentTime = System.currentTimeMillis();
blockInfo.initializeBlockRecovery(3);
blockInfoRecovery = dd3.getLeaseRecoveryCommand(1);
assertEquals(blockInfoRecovery[0], blockInfo);
// Recovery attempt #4.
// Reset everything. And again pick DN with most recent heart beat.
currentTime = System.currentTimeMillis();
dd1.setLastUpdate(currentTime - 2 * 1000);
dd2.setLastUpdate(currentTime - 1 * 1000);
dd3.setLastUpdate(currentTime);
currentTime = System.currentTimeMillis();
blockInfo.initializeBlockRecovery(3);
blockInfoRecovery = dd3.getLeaseRecoveryCommand(1);
assertEquals(blockInfoRecovery[0], blockInfo);
}
}

View File

@ -20,17 +20,21 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.common.GenerationStamp; import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand; import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol; import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
@ -56,14 +60,12 @@ public class TestHeartbeatHandling {
final HeartbeatManager hm = namesystem.getBlockManager( final HeartbeatManager hm = namesystem.getBlockManager(
).getDatanodeManager().getHeartbeatManager(); ).getDatanodeManager().getHeartbeatManager();
final String poolId = namesystem.getBlockPoolId(); final String poolId = namesystem.getBlockPoolId();
final DatanodeRegistration nodeReg = final DatanodeRegistration nodeReg =
DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(0), poolId); DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(0), poolId);
final DatanodeDescriptor dd = NameNodeAdapter.getDatanode(namesystem, nodeReg); final DatanodeDescriptor dd = NameNodeAdapter.getDatanode(namesystem, nodeReg);
final int REMAINING_BLOCKS = 1; final int REMAINING_BLOCKS = 1;
final int MAX_REPLICATE_LIMIT = final int MAX_REPLICATE_LIMIT =
conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 2); conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 2);
final int MAX_INVALIDATE_LIMIT = DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_DEFAULT; final int MAX_INVALIDATE_LIMIT = DFSConfigKeys.DFS_BLOCK_INVALIDATE_LIMIT_DEFAULT;
final int MAX_INVALIDATE_BLOCKS = 2*MAX_INVALIDATE_LIMIT+REMAINING_BLOCKS; final int MAX_INVALIDATE_BLOCKS = 2*MAX_INVALIDATE_LIMIT+REMAINING_BLOCKS;
@ -83,7 +85,7 @@ public class TestHeartbeatHandling {
assertEquals(1, cmds.length); assertEquals(1, cmds.length);
assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction()); assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction());
assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand)cmds[0]).getBlocks().length); assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand)cmds[0]).getBlocks().length);
ArrayList<Block> blockList = new ArrayList<Block>(MAX_INVALIDATE_BLOCKS); ArrayList<Block> blockList = new ArrayList<Block>(MAX_INVALIDATE_BLOCKS);
for (int i=0; i<MAX_INVALIDATE_BLOCKS; i++) { for (int i=0; i<MAX_INVALIDATE_BLOCKS; i++) {
blockList.add(new Block(i, 0, GenerationStamp.LAST_RESERVED_STAMP)); blockList.add(new Block(i, 0, GenerationStamp.LAST_RESERVED_STAMP));
@ -122,4 +124,113 @@ public class TestHeartbeatHandling {
cluster.shutdown(); cluster.shutdown();
} }
} }
/**
* Test if
* {@link FSNamesystem#handleHeartbeat}
* correctly selects data node targets for block recovery.
*/
@Test
public void testHeartbeatBlockRecovery() throws Exception {
final Configuration conf = new HdfsConfiguration();
final MiniDFSCluster cluster =
new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
try {
cluster.waitActive();
final FSNamesystem namesystem = cluster.getNamesystem();
final HeartbeatManager hm = namesystem.getBlockManager(
).getDatanodeManager().getHeartbeatManager();
final String poolId = namesystem.getBlockPoolId();
final DatanodeRegistration nodeReg1 =
DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(0), poolId);
final DatanodeDescriptor dd1 = NameNodeAdapter.getDatanode(namesystem, nodeReg1);
final DatanodeRegistration nodeReg2 =
DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(1), poolId);
final DatanodeDescriptor dd2 = NameNodeAdapter.getDatanode(namesystem, nodeReg2);
final DatanodeRegistration nodeReg3 =
DataNodeTestUtils.getDNRegistrationForBP(cluster.getDataNodes().get(2), poolId);
final DatanodeDescriptor dd3 = NameNodeAdapter.getDatanode(namesystem, nodeReg3);
try {
namesystem.writeLock();
synchronized(hm) {
NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem);
NameNodeAdapter.sendHeartBeat(nodeReg2, dd2, namesystem);
NameNodeAdapter.sendHeartBeat(nodeReg3, dd3, namesystem);
// Test with all alive nodes.
dd1.setLastUpdate(System.currentTimeMillis());
dd2.setLastUpdate(System.currentTimeMillis());
dd3.setLastUpdate(System.currentTimeMillis());
BlockInfoUnderConstruction blockInfo = new BlockInfoUnderConstruction(
new Block(0, 0, GenerationStamp.LAST_RESERVED_STAMP), 3,
BlockUCState.UNDER_RECOVERY,
new DatanodeDescriptor[] {dd1, dd2, dd3});
dd1.addBlockToBeRecovered(blockInfo);
DatanodeCommand[] cmds =
NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem).getCommands();
assertEquals(1, cmds.length);
assertEquals(DatanodeProtocol.DNA_RECOVERBLOCK, cmds[0].getAction());
BlockRecoveryCommand recoveryCommand = (BlockRecoveryCommand)cmds[0];
assertEquals(1, recoveryCommand.getRecoveringBlocks().size());
DatanodeInfo[] recoveringNodes = recoveryCommand.getRecoveringBlocks()
.toArray(new BlockRecoveryCommand.RecoveringBlock[0])[0].getLocations();
assertEquals(3, recoveringNodes.length);
assertEquals(recoveringNodes[0], (DatanodeInfo)dd1);
assertEquals(recoveringNodes[1], (DatanodeInfo)dd2);
assertEquals(recoveringNodes[2], (DatanodeInfo)dd3);
// Test with one stale node.
dd1.setLastUpdate(System.currentTimeMillis());
// More than the default stale interval of 30 seconds.
dd2.setLastUpdate(System.currentTimeMillis() - 40 * 1000);
dd3.setLastUpdate(System.currentTimeMillis());
blockInfo = new BlockInfoUnderConstruction(
new Block(0, 0, GenerationStamp.LAST_RESERVED_STAMP), 3,
BlockUCState.UNDER_RECOVERY,
new DatanodeDescriptor[] {dd1, dd2, dd3});
dd1.addBlockToBeRecovered(blockInfo);
cmds = NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem).getCommands();
assertEquals(1, cmds.length);
assertEquals(DatanodeProtocol.DNA_RECOVERBLOCK, cmds[0].getAction());
recoveryCommand = (BlockRecoveryCommand)cmds[0];
assertEquals(1, recoveryCommand.getRecoveringBlocks().size());
recoveringNodes = recoveryCommand.getRecoveringBlocks()
.toArray(new BlockRecoveryCommand.RecoveringBlock[0])[0].getLocations();
assertEquals(2, recoveringNodes.length);
// dd2 is skipped.
assertEquals(recoveringNodes[0], (DatanodeInfo)dd1);
assertEquals(recoveringNodes[1], (DatanodeInfo)dd3);
// Test with all stale nodes.
dd1.setLastUpdate(System.currentTimeMillis() - 60 * 1000);
// More than the default stale interval of 30 seconds.
dd2.setLastUpdate(System.currentTimeMillis() - 40 * 1000);
dd3.setLastUpdate(System.currentTimeMillis() - 80 * 1000);
blockInfo = new BlockInfoUnderConstruction(
new Block(0, 0, GenerationStamp.LAST_RESERVED_STAMP), 3,
BlockUCState.UNDER_RECOVERY,
new DatanodeDescriptor[] {dd1, dd2, dd3});
dd1.addBlockToBeRecovered(blockInfo);
cmds = NameNodeAdapter.sendHeartBeat(nodeReg1, dd1, namesystem).getCommands();
assertEquals(1, cmds.length);
assertEquals(DatanodeProtocol.DNA_RECOVERBLOCK, cmds[0].getAction());
recoveryCommand = (BlockRecoveryCommand)cmds[0];
assertEquals(1, recoveryCommand.getRecoveringBlocks().size());
recoveringNodes = recoveryCommand.getRecoveringBlocks()
.toArray(new BlockRecoveryCommand.RecoveringBlock[0])[0].getLocations();
// dd1 is not stale because it just sent a heartbeat; since it is the only
// non-stale replica, all replicas are chosen for recovery.
assertEquals(3, recoveringNodes.length);
assertEquals(recoveringNodes[0], (DatanodeInfo)dd1);
assertEquals(recoveringNodes[1], (DatanodeInfo)dd2);
assertEquals(recoveringNodes[2], (DatanodeInfo)dd3);
}
} finally {
namesystem.writeUnlock();
}
} finally {
cluster.shutdown();
}
}
} }

View File

@ -1059,7 +1059,7 @@ public class NNThroughputBenchmark {
ExtendedBlock prevBlock = null; ExtendedBlock prevBlock = null;
for(int jdx = 0; jdx < blocksPerFile; jdx++) { for(int jdx = 0; jdx < blocksPerFile; jdx++) {
LocatedBlock loc = nameNodeProto.addBlock(fileName, clientName, LocatedBlock loc = nameNodeProto.addBlock(fileName, clientName,
prevBlock, null, INodeId.GRANDFATHER_INODE_ID); prevBlock, null, INodeId.GRANDFATHER_INODE_ID, null);
prevBlock = loc.getBlock(); prevBlock = loc.getBlock();
for(DatanodeInfo dnInfo : loc.getLocations()) { for(DatanodeInfo dnInfo : loc.getLocations()) {
int dnIdx = Arrays.binarySearch(datanodes, dnInfo.getXferAddr()); int dnIdx = Arrays.binarySearch(datanodes, dnInfo.getXferAddr());

View File

@ -26,6 +26,7 @@ import static org.mockito.Mockito.spy;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -108,7 +109,7 @@ public class TestAddBlockRetry {
if(count == 1) { // run second addBlock() if(count == 1) { // run second addBlock()
LOG.info("Starting second addBlock for " + src); LOG.info("Starting second addBlock for " + src);
nn.addBlock(src, "clientName", null, null, nn.addBlock(src, "clientName", null, null,
INodeId.GRANDFATHER_INODE_ID); INodeId.GRANDFATHER_INODE_ID, null);
LocatedBlocks lbs = nn.getBlockLocations(src, 0, Long.MAX_VALUE); LocatedBlocks lbs = nn.getBlockLocations(src, 0, Long.MAX_VALUE);
assertEquals("Must be one block", 1, lbs.getLocatedBlocks().size()); assertEquals("Must be one block", 1, lbs.getLocatedBlocks().size());
lb2 = lbs.get(0); lb2 = lbs.get(0);
@ -119,7 +120,7 @@ public class TestAddBlockRetry {
} }
}).when(spyBM).chooseTarget(Mockito.anyString(), Mockito.anyInt(), }).when(spyBM).chooseTarget(Mockito.anyString(), Mockito.anyInt(),
Mockito.<DatanodeDescriptor>any(), Mockito.<HashMap<Node, Node>>any(), Mockito.<DatanodeDescriptor>any(), Mockito.<HashMap<Node, Node>>any(),
Mockito.anyLong()); Mockito.anyLong(), Mockito.<List<String>>any());
// create file // create file
nn.create(src, FsPermission.getFileDefault(), nn.create(src, FsPermission.getFileDefault(),
@ -129,7 +130,7 @@ public class TestAddBlockRetry {
// start first addBlock() // start first addBlock()
LOG.info("Starting first addBlock for " + src); LOG.info("Starting first addBlock for " + src);
nn.addBlock(src, "clientName", null, null, INodeId.GRANDFATHER_INODE_ID); nn.addBlock(src, "clientName", null, null, INodeId.GRANDFATHER_INODE_ID, null);
// check locations // check locations
LocatedBlocks lbs = nn.getBlockLocations(src, 0, Long.MAX_VALUE); LocatedBlocks lbs = nn.getBlockLocations(src, 0, Long.MAX_VALUE);

View File

@ -77,6 +77,7 @@ import org.apache.hadoop.util.ExitUtil.ExitException;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Level; import org.apache.log4j.Level;
import org.junit.After; import org.junit.After;
import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.mockito.ArgumentMatcher; import org.mockito.ArgumentMatcher;
@ -1061,6 +1062,10 @@ public class TestCheckpoint {
secondary = startSecondaryNameNode(conf); secondary = startSecondaryNameNode(conf);
secondary.doCheckpoint(); secondary.doCheckpoint();
FSDirectory secondaryFsDir = secondary.getFSNamesystem().dir;
INode rootInMap = secondaryFsDir.getInode(secondaryFsDir.rootDir.getId());
Assert.assertSame(rootInMap, secondaryFsDir.rootDir);
fileSys.delete(tmpDir, true); fileSys.delete(tmpDir, true);
fileSys.mkdirs(tmpDir); fileSys.mkdirs(tmpDir);
secondary.doCheckpoint(); secondary.doCheckpoint();

View File

@ -0,0 +1,220 @@
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import static org.junit.Assert.*;
import java.util.ArrayList;
import java.util.Random;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.client.HdfsDataOutputStream;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.junit.Test;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public class TestFavoredNodesEndToEnd {
private static MiniDFSCluster cluster;
private static Configuration conf;
private final static int NUM_DATA_NODES = 10;
private final static int NUM_FILES = 10;
private final static byte[] SOME_BYTES = new String("foo").getBytes();
private static DistributedFileSystem dfs;
private static ArrayList<DataNode> datanodes;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
conf = new Configuration();
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATA_NODES)
.build();
cluster.waitClusterUp();
dfs = cluster.getFileSystem();
datanodes = cluster.getDataNodes();
}
@AfterClass
public static void tearDownAfterClass() throws Exception {
if (cluster != null) {
cluster.shutdown();
}
}
@Test
public void testFavoredNodesEndToEnd() throws Exception {
//create 10 files with random preferred nodes
for (int i = 0; i < NUM_FILES; i++) {
Random rand = new Random(System.currentTimeMillis() + i);
//pass a newly created rand so as to get a uniform distribution each time
//without too many collisions (look at the do-while loops in getDatanodes)
InetSocketAddress datanode[] = getDatanodes(rand);
Path p = new Path("/filename"+i);
FSDataOutputStream out = dfs.create(p, FsPermission.getDefault(), true,
4096, (short)3, (long)4096, null, datanode);
out.write(SOME_BYTES);
out.close();
BlockLocation[] locations =
dfs.getClient().getBlockLocations(p.toUri().getPath(), 0,
Long.MAX_VALUE);
//make sure we have exactly one block location, and three hosts
assertTrue(locations.length == 1 && locations[0].getHosts().length == 3);
//verify the files got created in the right nodes
for (BlockLocation loc : locations) {
String[] hosts = loc.getNames();
String[] hosts1 = getStringForInetSocketAddrs(datanode);
assertTrue(compareNodes(hosts, hosts1));
}
}
}
@Test
public void testWhenFavoredNodesNotPresent() throws Exception {
//when we ask for favored nodes but those nodes are not available, we should
//get some other nodes. In other words, the write to HDFS should not fail,
//and getBlockLocations on the file should still report one block location
//with three hosts
Random rand = new Random(System.currentTimeMillis());
InetSocketAddress arbitraryAddrs[] = new InetSocketAddress[3];
for (int i = 0; i < 3; i++) {
arbitraryAddrs[i] = getArbitraryLocalHostAddr();
}
Path p = new Path("/filename-foo-bar");
FSDataOutputStream out = dfs.create(p, FsPermission.getDefault(), true,
4096, (short)3, (long)4096, null, arbitraryAddrs);
out.write(SOME_BYTES);
out.close();
BlockLocation[] locations =
dfs.getClient().getBlockLocations(p.toUri().getPath(), 0,
Long.MAX_VALUE);
assertTrue(locations.length == 1 && locations[0].getHosts().length == 3);
}
@Test
public void testWhenSomeNodesAreNotGood() throws Exception {
//make one datanode not "good" so that even if the client prefers it,
//the namenode will not choose it as a replica to write to
DatanodeInfo d = cluster.getNameNode().getNamesystem().getBlockManager()
.getDatanodeManager().getDatanodeByXferAddr(
datanodes.get(0).getXferAddress().getAddress().getHostAddress(),
datanodes.get(0).getXferAddress().getPort());
//set the decommission status to true so that
//BlockPlacementPolicyDefault.isGoodTarget returns false for this dn
d.setDecommissioned();
InetSocketAddress addrs[] = new InetSocketAddress[3];
for (int i = 0; i < 3; i++) {
addrs[i] = datanodes.get(i).getXferAddress();
}
Path p = new Path("/filename-foo-bar-baz");
FSDataOutputStream out = dfs.create(p, FsPermission.getDefault(), true,
4096, (short)3, (long)4096, null, addrs);
out.write(SOME_BYTES);
out.close();
BlockLocation[] locations =
dfs.getClient().getBlockLocations(p.toUri().getPath(), 0,
Long.MAX_VALUE);
//reset the state
d.stopDecommission();
assertTrue(locations.length == 1 && locations[0].getHosts().length == 3);
//also make sure that the datanode[0] is not in the list of hosts
String datanode0 =
datanodes.get(0).getXferAddress().getAddress().getHostAddress()
+ ":" + datanodes.get(0).getXferAddress().getPort();
for (int i = 0; i < 3; i++) {
if (locations[0].getNames()[i].equals(datanode0)) {
fail(datanode0 + " not supposed to be a replica for the block");
}
}
}
private String[] getStringForInetSocketAddrs(InetSocketAddress[] datanode) {
String strs[] = new String[datanode.length];
for (int i = 0; i < datanode.length; i++) {
strs[i] = datanode[i].getAddress().getHostAddress() + ":" +
datanode[i].getPort();
}
return strs;
}
private boolean compareNodes(String[] dnList1, String[] dnList2) {
for (int i = 0; i < dnList1.length; i++) {
boolean matched = false;
for (int j = 0; j < dnList2.length; j++) {
if (dnList1[i].equals(dnList2[j])) {
matched = true;
break;
}
}
if (matched == false) {
fail(dnList1[i] + " not a favored node");
}
}
return true;
}
private InetSocketAddress[] getDatanodes(Random rand) {
//Get some unique random indexes
int idx1 = rand.nextInt(NUM_DATA_NODES);
int idx2;
do {
idx2 = rand.nextInt(NUM_DATA_NODES);
} while (idx1 == idx2);
int idx3;
do {
idx3 = rand.nextInt(NUM_DATA_NODES);
} while (idx2 == idx3 || idx1 == idx3);
InetSocketAddress[] addrs = new InetSocketAddress[3];
addrs[0] = datanodes.get(idx1).getXferAddress();
addrs[1] = datanodes.get(idx2).getXferAddress();
addrs[2] = datanodes.get(idx3).getXferAddress();
return addrs;
}
private InetSocketAddress getArbitraryLocalHostAddr()
throws UnknownHostException{
Random rand = new Random(System.currentTimeMillis());
int port = rand.nextInt(65535);
while (true) {
boolean conflict = false;
for (DataNode d : datanodes) {
if (d.getXferAddress().getPort() == port) {
port = rand.nextInt(65535);
conflict = true;
}
}
if (conflict == false) {
break;
}
}
return new InetSocketAddress(InetAddress.getLocalHost(), port);
}
}
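
For reference, a minimal client-side sketch of the API the tests above exercise. The create overload taking an InetSocketAddress[] of favored nodes is the one added by HDFS-2576; the path, buffer size, replication and block size below are illustrative placeholders:

import java.net.InetSocketAddress;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DistributedFileSystem;

class FavoredNodesUsageSketch {
  // Write a file while hinting which datanodes should hold its replicas.
  static void writeWithFavoredNodes(DistributedFileSystem dfs,
      InetSocketAddress[] favoredNodes) throws Exception {
    Path p = new Path("/example-favored-nodes");    // hypothetical path
    FSDataOutputStream out = dfs.create(p, FsPermission.getDefault(),
        true /*overwrite*/, 4096, (short) 3, 4096L, null /*no progress*/, favoredNodes);
    out.write("foo".getBytes());
    out.close();
    // The favored nodes are hints: if they are unavailable or not "good" targets,
    // the namenode falls back to its normal placement, as the tests above verify.
  }
}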

View File

@ -521,9 +521,17 @@ public class TestPipelinesFailover {
storedBlock instanceof BlockInfoUnderConstruction); storedBlock instanceof BlockInfoUnderConstruction);
BlockInfoUnderConstruction ucBlock = BlockInfoUnderConstruction ucBlock =
(BlockInfoUnderConstruction)storedBlock; (BlockInfoUnderConstruction)storedBlock;
// We expect that the first indexed replica will be the one // We expect that the replica with the most recent heart beat will be
// to be in charge of the synchronization / recovery protocol. // the one to be in charge of the synchronization / recovery protocol.
DatanodeDescriptor expectedPrimary = ucBlock.getExpectedLocations()[0]; DatanodeDescriptor[] datanodes = ucBlock.getExpectedLocations();
DatanodeDescriptor expectedPrimary = datanodes[0];
long mostRecentLastUpdate = expectedPrimary.getLastUpdate();
for (int i = 1; i < datanodes.length; i++) {
if (datanodes[i].getLastUpdate() > mostRecentLastUpdate) {
expectedPrimary = datanodes[i];
mostRecentLastUpdate = expectedPrimary.getLastUpdate();
}
}
return expectedPrimary; return expectedPrimary;
} }

View File

@ -356,6 +356,9 @@ Release 2.0.5-beta - UNRELEASED
MAPREDUCE-5178. Update MR App to set progress in ApplicationReport after MAPREDUCE-5178. Update MR App to set progress in ApplicationReport after
YARN-577. (Hitesh Shah via vinodkv) YARN-577. (Hitesh Shah via vinodkv)
MAPREDUCE-5167. Update MR App after YARN-562 to use the new builder API
for the container. (Jian He via vinodkv)
Release 2.0.4-alpha - UNRELEASED Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -1094,12 +1094,12 @@ public abstract class TaskAttemptImpl implements
+ taInfo.getPort()); + taInfo.getPort());
String nodeHttpAddress = StringInterner.weakIntern(taInfo.getHostname() + ":" String nodeHttpAddress = StringInterner.weakIntern(taInfo.getHostname() + ":"
+ taInfo.getHttpPort()); + taInfo.getHttpPort());
// Resource/Priority/Tokens are only needed while launching the // Resource/Priority/Tokens and RMIdentifier are only needed while
// container on an NM, these are already completed tasks, so setting them to // launching the container on an NM, these are already completed tasks, so
// null // setting them to null and RMIdentifier as 0
container = container =
BuilderUtils.newContainer(containerId, containerNodeId, BuilderUtils.newContainer(containerId, containerNodeId,
nodeHttpAddress, null, null, null); nodeHttpAddress, null, null, null, 0);
computeRackAndLocality(); computeRackAndLocality();
launchTime = taInfo.getStartTime(); launchTime = taInfo.getStartTime();
finishTime = (taInfo.getFinishTime() != -1) ? finishTime = (taInfo.getFinishTime() != -1) ?

View File

@ -519,7 +519,7 @@ public class MRApp extends MRAppMaster {
cId.setId(containerCount++); cId.setId(containerCount++);
NodeId nodeId = BuilderUtils.newNodeId(NM_HOST, NM_PORT); NodeId nodeId = BuilderUtils.newNodeId(NM_HOST, NM_PORT);
Container container = BuilderUtils.newContainer(cId, nodeId, Container container = BuilderUtils.newContainer(cId, nodeId,
NM_HOST + ":" + NM_HTTP_PORT, null, null, null); NM_HOST + ":" + NM_HTTP_PORT, null, null, null, 0);
JobID id = TypeConverter.fromYarn(applicationId); JobID id = TypeConverter.fromYarn(applicationId);
JobId jobId = TypeConverter.toYarn(id); JobId jobId = TypeConverter.toYarn(id);
getContext().getEventHandler().handle(new JobHistoryEvent(jobId, getContext().getEventHandler().handle(new JobHistoryEvent(jobId,

View File

@ -243,7 +243,7 @@ public class MRAppBenchmark {
.newContainer(containerId, BuilderUtils.newNodeId("host" .newContainer(containerId, BuilderUtils.newNodeId("host"
+ containerId.getId(), 2345), + containerId.getId(), 2345),
"host" + containerId.getId() + ":5678", req "host" + containerId.getId() + ":5678", req
.getCapability(), req.getPriority(), null)); .getCapability(), req.getPriority(), null, 0));
} }
} }

View File

@ -187,6 +187,12 @@ Release 2.0.5-beta - UNRELEASED
YARN-595. Refactor fair scheduler to use common Resources. (Sandy Ryza YARN-595. Refactor fair scheduler to use common Resources. (Sandy Ryza
via tomwhite) via tomwhite)
YARN-562. Modified NM to reject any containers allocated by a previous
ResourceManager. (Jian He via vinodkv)
YARN-591. Moved RM recovery related records out of public API as they do not
belong there. (vinodkv)
OPTIMIZATIONS OPTIMIZATIONS
BUG FIXES BUG FIXES
@ -596,6 +602,9 @@ Release 0.23.8 - UNRELEASED
BUG FIXES BUG FIXES
YARN-363. Add webapps/proxy directory without which YARN proxy-server fails
when started in stand-alone mode. (Kenji Kikushima via vinodkv)
Release 0.23.7 - UNRELEASED Release 0.23.7 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -135,4 +135,16 @@ public interface Container extends Comparable<Container> {
@Private @Private
@Unstable @Unstable
void setContainerToken(ContainerToken containerToken); void setContainerToken(ContainerToken containerToken);
/**
* Get the RMIdentifier of RM in which containers are allocated
* @return RMIdentifier
*/
@Private
@Unstable
long getRMIdentifer();
@Private
@Unstable
void setRMIdentifier(long rmIdentifier);
} }

View File

@ -230,6 +230,18 @@ public class ContainerPBImpl extends ProtoBase<ContainerProto> implements Contai
this.containerToken = containerToken; this.containerToken = containerToken;
} }
@Override
public long getRMIdentifer() {
ContainerProtoOrBuilder p = viaProto ? proto : builder;
return p.getRmIdentifier();
}
@Override
public void setRMIdentifier(long rmIdentifier) {
maybeInitBuilder();
builder.setRmIdentifier((rmIdentifier));
}
private ContainerIdPBImpl convertFromProtoFormat(ContainerIdProto p) { private ContainerIdPBImpl convertFromProtoFormat(ContainerIdProto p) {
return new ContainerIdPBImpl(p); return new ContainerIdPBImpl(p);
} }

View File

@ -68,6 +68,7 @@ message ContainerProto {
optional ResourceProto resource = 4; optional ResourceProto resource = 4;
optional PriorityProto priority = 5; optional PriorityProto priority = 5;
optional hadoop.common.TokenProto container_token = 6; optional hadoop.common.TokenProto container_token = 6;
optional int64 rm_identifier = 7;
} }
enum YarnApplicationStateProto { enum YarnApplicationStateProto {
@ -311,16 +312,3 @@ message StringBytesMapProto {
optional string key = 1; optional string key = 1;
optional bytes value = 2; optional bytes value = 2;
} }
////////////////////////////////////////////////////////////////////////
////// From recovery////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
message ApplicationStateDataProto {
optional int64 submit_time = 1;
optional ApplicationSubmissionContextProto application_submission_context = 2;
}
message ApplicationAttemptStateDataProto {
optional ApplicationAttemptIdProto attemptId = 1;
optional ContainerProto master_container = 2;
}

View File

@ -21,6 +21,7 @@ option java_outer_classname = "YarnServerResourceManagerServiceProtos";
option java_generic_services = true; option java_generic_services = true;
option java_generate_equals_and_hash = true; option java_generate_equals_and_hash = true;
import "yarn_protos.proto";
message RefreshQueuesRequestProto { message RefreshQueuesRequestProto {
} }
@ -59,3 +60,16 @@ message GetGroupsForUserRequestProto {
message GetGroupsForUserResponseProto { message GetGroupsForUserResponseProto {
repeated string groups = 1; repeated string groups = 1;
} }
////////////////////////////////////////////////////////////////////////
////// RM recovery related records /////////////////////////////////////
////////////////////////////////////////////////////////////////////////
message ApplicationStateDataProto {
optional int64 submit_time = 1;
optional ApplicationSubmissionContextProto application_submission_context = 2;
}
message ApplicationAttemptStateDataProto {
optional ApplicationAttemptIdProto attemptId = 1;
optional ContainerProto master_container = 2;
}

View File

@ -56,7 +56,7 @@ public class TestAMRMClientAsync {
BuilderUtils.newContainerId(0, 0, 0, 0), BuilderUtils.newContainerId(0, 0, 0, 0),
ContainerState.COMPLETE, "", 0)); ContainerState.COMPLETE, "", 0));
List<Container> allocated1 = Arrays.asList( List<Container> allocated1 = Arrays.asList(
BuilderUtils.newContainer(null, null, null, null, null, null)); BuilderUtils.newContainer(null, null, null, null, null, null, 0));
final AllocateResponse response1 = createAllocateResponse( final AllocateResponse response1 = createAllocateResponse(
new ArrayList<ContainerStatus>(), allocated1); new ArrayList<ContainerStatus>(), allocated1);
final AllocateResponse response2 = createAllocateResponse(completed1, final AllocateResponse response2 = createAllocateResponse(completed1,

View File

@ -237,9 +237,9 @@ public class BuilderUtils {
return containerStatus; return containerStatus;
} }
public static Container newContainer(ContainerId containerId, public static Container newContainer(ContainerId containerId, NodeId nodeId,
NodeId nodeId, String nodeHttpAddress, String nodeHttpAddress, Resource resource, Priority priority,
Resource resource, Priority priority, ContainerToken containerToken) { ContainerToken containerToken, long rmIdentifier) {
Container container = recordFactory.newRecordInstance(Container.class); Container container = recordFactory.newRecordInstance(Container.class);
container.setId(containerId); container.setId(containerId);
container.setNodeId(nodeId); container.setNodeId(nodeId);
@ -247,6 +247,7 @@ public class BuilderUtils {
container.setResource(resource); container.setResource(resource);
container.setPriority(priority); container.setPriority(priority);
container.setContainerToken(containerToken); container.setContainerToken(containerToken);
container.setRMIdentifier(rmIdentifier);
return container; return container;
} }
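
Call sites change accordingly; the following sketch mirrors how the tests in this patch construct a Container with the extra argument (the nulls and 0 are placeholders, real callers pass the identifier of the current RM):

import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.util.BuilderUtils;

class NewContainerCallSketch {
  static Container example() {
    long rmIdentifier = 0L;   // 0 stands in for "no/unknown RM" here
    return BuilderUtils.newContainer(null, null, null, null, null, null, rmIdentifier);
  }
}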

View File

@ -105,7 +105,7 @@ public class TestContainerLaunchRPC {
containerId.setId(100); containerId.setId(100);
Container container = Container container =
BuilderUtils.newContainer(containerId, null, null, recordFactory BuilderUtils.newContainer(containerId, null, null, recordFactory
.newRecordInstance(Resource.class), null, null); .newRecordInstance(Resource.class), null, null, 0);
StartContainerRequest scRequest = recordFactory StartContainerRequest scRequest = recordFactory
.newRecordInstance(StartContainerRequest.class); .newRecordInstance(StartContainerRequest.class);

View File

@ -128,7 +128,7 @@ public class TestRPC {
containerId.setId(100); containerId.setId(100);
Container mockContainer = Container mockContainer =
BuilderUtils.newContainer(containerId, null, null, recordFactory BuilderUtils.newContainer(containerId, null, null, recordFactory
.newRecordInstance(Resource.class), null, null); .newRecordInstance(Resource.class), null, null, 0);
// containerLaunchContext.env = new HashMap<CharSequence, CharSequence>(); // containerLaunchContext.env = new HashMap<CharSequence, CharSequence>();
// containerLaunchContext.command = new ArrayList<CharSequence>(); // containerLaunchContext.command = new ArrayList<CharSequence>();

View File

@ -0,0 +1,24 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.api;
public interface ResourceManagerConstants {
public static final long RM_INVALID_IDENTIFIER = 0;
}

View File

@ -30,4 +30,7 @@ public interface RegisterNodeManagerResponse {
void setNodeAction(NodeAction nodeAction); void setNodeAction(NodeAction nodeAction);
long getRMIdentifier();
void setRMIdentifier(long rmIdentifier);
} }

View File

@ -121,6 +121,18 @@ public class RegisterNodeManagerResponsePBImpl extends ProtoBase<RegisterNodeMan
rebuild = true; rebuild = true;
} }
@Override
public long getRMIdentifier() {
RegisterNodeManagerResponseProtoOrBuilder p = viaProto ? proto : builder;
return (p.getRmIdentifier());
}
@Override
public void setRMIdentifier(long rmIdentifier) {
maybeInitBuilder();
builder.setRmIdentifier(rmIdentifier);
}
private NodeAction convertFromProtoFormat(NodeActionProto p) { private NodeAction convertFromProtoFormat(NodeActionProto p) {
return NodeAction.valueOf(p.name()); return NodeAction.valueOf(p.name());
} }

View File

@ -33,6 +33,7 @@ message RegisterNodeManagerRequestProto {
message RegisterNodeManagerResponseProto { message RegisterNodeManagerResponseProto {
optional MasterKeyProto master_key = 1; optional MasterKeyProto master_key = 1;
optional NodeActionProto nodeAction = 2; optional NodeActionProto nodeAction = 2;
optional int64 rm_identifier = 3;
} }
message NodeHeartbeatRequestProto { message NodeHeartbeatRequestProto {

View File

@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager;
import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.yarn.api.ContainerManager;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.NodeHealthStatus; import org.apache.hadoop.yarn.api.records.NodeHealthStatus;
@ -48,4 +49,6 @@ public interface Context {
NMContainerTokenSecretManager getContainerTokenSecretManager(); NMContainerTokenSecretManager getContainerTokenSecretManager();
NodeHealthStatus getNodeHealthStatus(); NodeHealthStatus getNodeHealthStatus();
ContainerManager getContainerManager();
} }

View File

@ -38,6 +38,7 @@ import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.YarnException; import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler; import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.ContainerManager;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.NodeHealthStatus; import org.apache.hadoop.yarn.api.records.NodeHealthStatus;
@ -164,6 +165,7 @@ public class NodeManager extends CompositeService
addService(nodeHealthChecker); addService(nodeHealthChecker);
dirsHandler = nodeHealthChecker.getDiskHandler(); dirsHandler = nodeHealthChecker.getDiskHandler();
nodeStatusUpdater = nodeStatusUpdater =
createNodeStatusUpdater(context, dispatcher, nodeHealthChecker); createNodeStatusUpdater(context, dispatcher, nodeHealthChecker);
@ -174,6 +176,7 @@ public class NodeManager extends CompositeService
createContainerManager(context, exec, del, nodeStatusUpdater, createContainerManager(context, exec, del, nodeStatusUpdater,
this.aclsManager, dirsHandler); this.aclsManager, dirsHandler);
addService(containerManager); addService(containerManager);
((NMContext) context).setContainerManager(containerManager);
Service webServer = createWebServer(context, containerManager Service webServer = createWebServer(context, containerManager
.getContainersMonitor(), this.aclsManager, dirsHandler); .getContainersMonitor(), this.aclsManager, dirsHandler);
@ -221,11 +224,13 @@ public class NodeManager extends CompositeService
DefaultMetricsSystem.shutdown(); DefaultMetricsSystem.shutdown();
} }
protected void cleanupContainersOnResync() { protected void resyncWithRM() {
//we do not want to block dispatcher thread here //we do not want to block dispatcher thread here
new Thread() { new Thread() {
@Override @Override
public void run() { public void run() {
LOG.info("Notifying ContainerManager to block new container-requests");
containerManager.setBlockNewContainerRequests(true);
cleanupContainers(NodeManagerEventType.RESYNC); cleanupContainers(NodeManagerEventType.RESYNC);
((NodeStatusUpdaterImpl) nodeStatusUpdater ).rebootNodeStatusUpdater(); ((NodeStatusUpdaterImpl) nodeStatusUpdater ).rebootNodeStatusUpdater();
} }
@ -296,7 +301,7 @@ public class NodeManager extends CompositeService
new ConcurrentSkipListMap<ContainerId, Container>(); new ConcurrentSkipListMap<ContainerId, Container>();
private final NMContainerTokenSecretManager containerTokenSecretManager; private final NMContainerTokenSecretManager containerTokenSecretManager;
private ContainerManager containerManager;
private final NodeHealthStatus nodeHealthStatus = RecordFactoryProvider private final NodeHealthStatus nodeHealthStatus = RecordFactoryProvider
.getRecordFactory(null).newRecordInstance(NodeHealthStatus.class); .getRecordFactory(null).newRecordInstance(NodeHealthStatus.class);
@ -333,6 +338,15 @@ public class NodeManager extends CompositeService
public NodeHealthStatus getNodeHealthStatus() { public NodeHealthStatus getNodeHealthStatus() {
return this.nodeHealthStatus; return this.nodeHealthStatus;
} }
@Override
public ContainerManager getContainerManager() {
return this.containerManager;
}
public void setContainerManager(ContainerManager containerManager) {
this.containerManager = containerManager;
}
} }
@ -376,7 +390,7 @@ public class NodeManager extends CompositeService
stop(); stop();
break; break;
case RESYNC: case RESYNC:
cleanupContainersOnResync(); resyncWithRM();
break; break;
default: default:
LOG.warn("Invalid shutdown event " + event.getType() + ". Ignoring."); LOG.warn("Invalid shutdown event " + event.getType() + ". Ignoring.");

View File

@ -24,5 +24,8 @@ import org.apache.hadoop.yarn.service.Service;
public interface NodeStatusUpdater extends Service { public interface NodeStatusUpdater extends Service {
void sendOutofBandHeartBeat(); void sendOutofBandHeartBeat();
NodeStatus getNodeStatusAndUpdateContainersInContext(); NodeStatus getNodeStatusAndUpdateContainersInContext();
long getRMIdentifier();
} }

View File

@ -48,6 +48,7 @@ import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.server.api.ResourceManagerConstants;
import org.apache.hadoop.yarn.server.api.ResourceTracker; import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse; import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
@ -56,6 +57,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResp
import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeAction;
import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.NodeStatus;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.service.AbstractService; import org.apache.hadoop.yarn.service.AbstractService;
@ -95,6 +97,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
private Runnable statusUpdaterRunnable; private Runnable statusUpdaterRunnable;
private Thread statusUpdater; private Thread statusUpdater;
private long rmIdentifier = ResourceManagerConstants.RM_INVALID_IDENTIFIER;
public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher, public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) { NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
@ -267,6 +270,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
this.resourceTracker = getRMClient(); this.resourceTracker = getRMClient();
regNMResponse = regNMResponse =
this.resourceTracker.registerNodeManager(request); this.resourceTracker.registerNodeManager(request);
this.rmIdentifier = regNMResponse.getRMIdentifier();
break; break;
} catch(Throwable e) { } catch(Throwable e) {
LOG.warn("Trying to connect to ResourceManager, " + LOG.warn("Trying to connect to ResourceManager, " +
@ -308,7 +312,9 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
LOG.info("Registered with ResourceManager as " + this.nodeId LOG.info("Registered with ResourceManager as " + this.nodeId
+ " with total resource of " + this.totalResource); + " with total resource of " + this.totalResource);
LOG.info("Notifying ContainerManager to unblock new container-requests");
((ContainerManagerImpl) this.context.getContainerManager())
.setBlockNewContainerRequests(false);
} }
private List<ApplicationId> createKeepAliveApplicationList() { private List<ApplicationId> createKeepAliveApplicationList() {
@ -334,6 +340,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
return appList; return appList;
} }
@Override
public NodeStatus getNodeStatusAndUpdateContainersInContext() { public NodeStatus getNodeStatusAndUpdateContainersInContext() {
NodeStatus nodeStatus = recordFactory.newRecordInstance(NodeStatus.class); NodeStatus nodeStatus = recordFactory.newRecordInstance(NodeStatus.class);
@ -407,6 +414,11 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
} }
} }
@Override
public long getRMIdentifier() {
return this.rmIdentifier;
}
protected void startStatusUpdater() { protected void startStatusUpdater() {
statusUpdaterRunnable = new Runnable() { statusUpdaterRunnable = new Runnable() {
@ -478,6 +490,9 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
if (response.getNodeAction() == NodeAction.RESYNC) { if (response.getNodeAction() == NodeAction.RESYNC) {
LOG.info("Node is out of sync with ResourceManager," LOG.info("Node is out of sync with ResourceManager,"
+ " hence rebooting."); + " hence rebooting.");
// Invalidate the RMIdentifier while resyncing
NodeStatusUpdaterImpl.this.rmIdentifier =
ResourceManagerConstants.RM_INVALID_IDENTIFIER;
dispatcher.getEventHandler().handle( dispatcher.getEventHandler().handle(
new NodeManagerEvent(NodeManagerEventType.RESYNC)); new NodeManagerEvent(NodeManagerEventType.RESYNC));
break; break;
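
Taken together, the updater keeps a single rmIdentifier field whose lifecycle the sketch below summarizes (illustrative names, not the actual NodeStatusUpdaterImpl code): it starts out invalid, is set from the registration response, and is invalidated again on RESYNC so that containers handed out by the previous RM can no longer be started:

class RmIdentifierLifecycleSketch {
  static final long RM_INVALID_IDENTIFIER = 0;          // mirrors ResourceManagerConstants
  private volatile long rmIdentifier = RM_INVALID_IDENTIFIER;

  // Called after a successful registerNodeManager() round trip.
  void onRegistered(long identifierFromRegistrationResponse) {
    this.rmIdentifier = identifierFromRegistrationResponse;
  }

  // Called when the RM asks the node to RESYNC.
  void onResync() {
    this.rmIdentifier = RM_INVALID_IDENTIFIER;
  }

  long current() {
    return rmIdentifier;
  }
}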

View File

@ -23,10 +23,9 @@ import static org.apache.hadoop.yarn.service.Service.STATE.STARTED;
import java.io.IOException; import java.io.IOException;
import java.net.InetSocketAddress; import java.net.InetSocketAddress;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -125,6 +124,7 @@ public class ContainerManagerImpl extends CompositeService implements
private final ApplicationACLsManager aclsManager; private final ApplicationACLsManager aclsManager;
private final DeletionService deletionService; private final DeletionService deletionService;
private AtomicBoolean blockNewContainerRequests = new AtomicBoolean(false);
public ContainerManagerImpl(Context context, ContainerExecutor exec, public ContainerManagerImpl(Context context, ContainerExecutor exec,
DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater,
@ -239,7 +239,10 @@ public class ContainerManagerImpl extends CompositeService implements
false)) { false)) {
refreshServiceAcls(conf, new NMPolicyProvider()); refreshServiceAcls(conf, new NMPolicyProvider());
} }
LOG.info("Blocking new container-requests as container manager rpc" +
" server is still starting.");
this.setBlockNewContainerRequests(true);
server.start(); server.start();
InetSocketAddress connectAddress = NetUtils.getConnectAddress(server); InetSocketAddress connectAddress = NetUtils.getConnectAddress(server);
this.context.getNodeId().setHost(connectAddress.getHostName()); this.context.getNodeId().setHost(connectAddress.getHostName());
@ -393,6 +396,13 @@ public class ContainerManagerImpl extends CompositeService implements
@Override @Override
public StartContainerResponse startContainer(StartContainerRequest request) public StartContainerResponse startContainer(StartContainerRequest request)
throws YarnRemoteException { throws YarnRemoteException {
if (blockNewContainerRequests.get()) {
throw RPCUtil.getRemoteException(new NMNotYetReadyException(
"Rejecting new containers as NodeManager has not" +
" yet connected with ResourceManager"));
}
ContainerLaunchContext launchContext = request.getContainerLaunchContext(); ContainerLaunchContext launchContext = request.getContainerLaunchContext();
org.apache.hadoop.yarn.api.records.Container lauchContainer = org.apache.hadoop.yarn.api.records.Container lauchContainer =
request.getContainer(); request.getContainer();
@ -402,6 +412,16 @@ public class ContainerManagerImpl extends CompositeService implements
UserGroupInformation remoteUgi = getRemoteUgi(containerIDStr); UserGroupInformation remoteUgi = getRemoteUgi(containerIDStr);
authorizeRequest(containerIDStr, launchContext, lauchContainer, remoteUgi); authorizeRequest(containerIDStr, launchContext, lauchContainer, remoteUgi);
// Is the container coming from unknown RM
if (lauchContainer.getRMIdentifer() != nodeStatusUpdater
.getRMIdentifier()) {
String msg = "\nContainer "+ containerIDStr
+ " rejected as it is allocated by a previous RM";
LOG.error(msg);
throw RPCUtil
.getRemoteException(new InvalidContainerException(msg));
}
LOG.info("Start request for " + containerIDStr + " by user " LOG.info("Start request for " + containerIDStr + " by user "
+ launchContext.getUser()); + launchContext.getUser());
@ -615,6 +635,10 @@ public class ContainerManagerImpl extends CompositeService implements
} }
} }
public void setBlockNewContainerRequests(boolean blockNewContainerRequests) {
this.blockNewContainerRequests.set(blockNewContainerRequests);
}
@Override @Override
public void stateChanged(Service service) { public void stateChanged(Service service) {
// TODO Auto-generated method stub // TODO Auto-generated method stub
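
The two rejection paths added to startContainer can be summarized by the sketch below (simplified; the real code wraps the exceptions with RPCUtil.getRemoteException, and the exception classes are the ones introduced elsewhere in this patch):

import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.InvalidContainerException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.NMNotYetReadyException;

class StartContainerGateSketch {
  // Blocked from service start until registration with the RM completes.
  private final AtomicBoolean blockNewContainerRequests = new AtomicBoolean(true);
  private volatile long currentRmIdentifier = 0;   // learned from the registration response

  void checkStartContainer(long containerRmIdentifier) throws Exception {
    // 1. NM has not yet (re)connected with an RM: refuse every start request.
    if (blockNewContainerRequests.get()) {
      throw new NMNotYetReadyException(
          "Rejecting new containers as NodeManager has not yet connected with ResourceManager");
    }
    // 2. Container was allocated by a previous RM incarnation: reject it.
    if (containerRmIdentifier != currentRmIdentifier) {
      throw new InvalidContainerException(
          "Container rejected as it is allocated by a previous RM");
    }
    // ... normal container launch path follows ...
  }
}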

View File

@ -0,0 +1,33 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager;
import org.apache.hadoop.yarn.YarnException;
/**
* This exception is thrown when the NM rejects a container start request,
* for example for a container that was allocated by a previous RM
*/
public class InvalidContainerException extends YarnException {
private static final long serialVersionUID = 1L;
public InvalidContainerException(String msg) {
super(msg);
}
}

View File

@ -0,0 +1,34 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager;
import org.apache.hadoop.yarn.YarnException;
/**
* This exception happens when NM starts from scratch but has not yet connected
* with RM.
*/
public class NMNotYetReadyException extends YarnException {
private static final long serialVersionUID = 1L;
public NMNotYetReadyException(String msg) {
super(msg);
}
}

View File

@ -168,4 +168,9 @@ public class DummyContainerManager extends ContainerManagerImpl {
} }
}; };
} }
@Override
public void setBlockNewContainerRequests(boolean blockNewContainerRequests) {
// do nothing
}
} }

View File

@ -142,6 +142,17 @@ public class TestContainerManagerWithLCE extends TestContainerManager {
super.testLocalFilesCleanup(); super.testLocalFilesCleanup();
} }
@Override
public void testContainerLaunchFromPreviousRM() throws InterruptedException,
IOException {
// Don't run the test if the binary is not available.
if (!shouldRunTest()) {
LOG.info("LCE binary path is not passed. Not running the test");
return;
}
LOG.info("Running testContainerLaunchFromPreviousRM");
super.testContainerLaunchFromPreviousRM();
}
private boolean shouldRunTest() { private boolean shouldRunTest() {
return System return System
.getProperty(YarnConfiguration.NM_LINUX_CONTAINER_EXECUTOR_PATH) != null; .getProperty(YarnConfiguration.NM_LINUX_CONTAINER_EXECUTOR_PATH) != null;

View File

@ -0,0 +1,315 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.atomic.AtomicBoolean;
import junit.framework.Assert;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.NMNotYetReadyException;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.BuilderUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
public class TestNodeManagerResync {
static final File basedir =
new File("target", TestNodeManagerResync.class.getName());
static final File tmpDir = new File(basedir, "tmpDir");
static final File logsDir = new File(basedir, "logs");
static final File remoteLogsDir = new File(basedir, "remotelogs");
static final File nmLocalDir = new File(basedir, "nm0");
static final File processStartFile = new File(tmpDir, "start_file.txt")
.getAbsoluteFile();
static final RecordFactory recordFactory = RecordFactoryProvider
.getRecordFactory(null);
static final String user = "nobody";
private FileContext localFS;
private CyclicBarrier syncBarrier;
private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false);
@Before
public void setup() throws UnsupportedFileSystemException {
localFS = FileContext.getLocalFSFileContext();
tmpDir.mkdirs();
logsDir.mkdirs();
remoteLogsDir.mkdirs();
nmLocalDir.mkdirs();
syncBarrier = new CyclicBarrier(2);
}
@After
public void tearDown() throws IOException, InterruptedException {
localFS.delete(new Path(basedir.getPath()), true);
assertionFailedInThread.set(false);
}
@SuppressWarnings("unchecked")
@Test
public void testKillContainersOnResync() throws IOException,
InterruptedException {
NodeManager nm = new TestNodeManager1();
YarnConfiguration conf = createNMConfig();
nm.init(conf);
nm.start();
ContainerId cId = TestNodeManagerShutdown.createContainerId();
TestNodeManagerShutdown.startContainer(nm, cId, localFS, tmpDir,
processStartFile);
Assert.assertEquals(1, ((TestNodeManager1) nm).getNMRegistrationCount());
nm.getNMDispatcher().getEventHandler().
handle( new NodeManagerEvent(NodeManagerEventType.RESYNC));
try {
syncBarrier.await();
} catch (BrokenBarrierException e) {
}
Assert.assertEquals(2, ((TestNodeManager1) nm).getNMRegistrationCount());
Assert.assertFalse(assertionFailedInThread.get());
nm.stop();
}
// This test verifies that new container requests are blocked when the NM
// starts from scratch, until it registers with the RM, and also while the NM
// is resyncing with the RM
@SuppressWarnings("unchecked")
@Test
public void testBlockNewContainerRequestsOnStartAndResync()
throws IOException, InterruptedException {
NodeManager nm = new TestNodeManager2();
YarnConfiguration conf = createNMConfig();
nm.init(conf);
nm.start();
// Start the container in running state
ContainerId cId = TestNodeManagerShutdown.createContainerId();
TestNodeManagerShutdown.startContainer(nm, cId, localFS, tmpDir,
processStartFile);
nm.getNMDispatcher().getEventHandler()
.handle(new NodeManagerEvent(NodeManagerEventType.RESYNC));
try {
syncBarrier.await();
} catch (BrokenBarrierException e) {
}
Assert.assertFalse(assertionFailedInThread.get());
nm.stop();
}
private YarnConfiguration createNMConfig() {
YarnConfiguration conf = new YarnConfiguration();
conf.setInt(YarnConfiguration.NM_PMEM_MB, 5*1024); // 5GB
conf.set(YarnConfiguration.NM_ADDRESS, "127.0.0.1:12345");
conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, "127.0.0.1:12346");
conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
remoteLogsDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
return conf;
}
class TestNodeManager1 extends NodeManager {
private int registrationCount = 0;
@Override
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
return new TestNodeStatusUpdaterImpl1(context, dispatcher,
healthChecker, metrics);
}
public int getNMRegistrationCount() {
return registrationCount;
}
class TestNodeStatusUpdaterImpl1 extends MockNodeStatusUpdater {
public TestNodeStatusUpdaterImpl1(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
super(context, dispatcher, healthChecker, metrics);
}
@Override
protected void registerWithRM() throws YarnRemoteException {
super.registerWithRM();
registrationCount++;
}
@Override
protected void rebootNodeStatusUpdater() {
ConcurrentMap<ContainerId, org.apache.hadoop.yarn.server.nodemanager
.containermanager.container.Container> containers =
getNMContext().getContainers();
try {
// ensure that containers are empty before restarting nodeStatusUpdater
Assert.assertTrue(containers.isEmpty());
super.rebootNodeStatusUpdater();
syncBarrier.await();
} catch (InterruptedException e) {
} catch (BrokenBarrierException e) {
} catch (AssertionError ae) {
assertionFailedInThread.set(true);
}
}
}
}
class TestNodeManager2 extends NodeManager {
Thread launchContainersThread = null;
@Override
protected NodeStatusUpdater createNodeStatusUpdater(Context context,
Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
return new TestNodeStatusUpdaterImpl2(context, dispatcher,
healthChecker, metrics);
}
@Override
protected ContainerManagerImpl createContainerManager(Context context,
ContainerExecutor exec, DeletionService del,
NodeStatusUpdater nodeStatusUpdater, ApplicationACLsManager aclsManager,
LocalDirsHandlerService dirsHandler) {
return new ContainerManagerImpl(context, exec, del, nodeStatusUpdater,
metrics, aclsManager, dirsHandler){
@Override
public void setBlockNewContainerRequests(
boolean blockNewContainerRequests) {
if (blockNewContainerRequests) {
// start test thread right after blockNewContainerRequests is set
// true
super.setBlockNewContainerRequests(blockNewContainerRequests);
launchContainersThread = new RejectedContainersLauncherThread();
launchContainersThread.start();
} else {
// join the test thread right before blockNewContainerRequests is
// reset
try {
// stop the test thread
((RejectedContainersLauncherThread) launchContainersThread)
.setStopThreadFlag(true);
launchContainersThread.join();
((RejectedContainersLauncherThread) launchContainersThread)
.setStopThreadFlag(false);
super.setBlockNewContainerRequests(blockNewContainerRequests);
} catch (InterruptedException e) {
}
}
}
};
}
class TestNodeStatusUpdaterImpl2 extends MockNodeStatusUpdater {
public TestNodeStatusUpdaterImpl2(Context context, Dispatcher dispatcher,
NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
super(context, dispatcher, healthChecker, metrics);
}
@Override
protected void rebootNodeStatusUpdater() {
ConcurrentMap<ContainerId, org.apache.hadoop.yarn.server.nodemanager
.containermanager.container.Container> containers =
getNMContext().getContainers();
try {
// ensure that containers are empty before restarting nodeStatusUpdater
Assert.assertTrue(containers.isEmpty());
super.rebootNodeStatusUpdater();
// After this point new containers are free to be launched, except
// containers from previous RM
// Wait here so as to sync with the main test thread.
syncBarrier.await();
} catch (InterruptedException e) {
} catch (BrokenBarrierException e) {
} catch (AssertionError ae) {
assertionFailedInThread.set(true);
}
}
}
class RejectedContainersLauncherThread extends Thread {
boolean isStopped = false;
public void setStopThreadFlag(boolean isStopped) {
this.isStopped = isStopped;
}
@Override
public void run() {
int numContainers = 0;
int numContainersRejected = 0;
ContainerLaunchContext containerLaunchContext =
recordFactory.newRecordInstance(ContainerLaunchContext.class);
try {
while (!isStopped && numContainers < 10) {
ContainerId cId = TestNodeManagerShutdown.createContainerId();
Container container =
BuilderUtils.newContainer(cId, null, null, null, null, null, 0);
StartContainerRequest startRequest =
recordFactory.newRecordInstance(StartContainerRequest.class);
startRequest.setContainerLaunchContext(containerLaunchContext);
startRequest.setContainer(container);
System.out.println("no. of containers to be launched: "
+ numContainers);
numContainers++;
try {
getContainerManager().startContainer(startRequest);
} catch (YarnRemoteException e) {
numContainersRejected++;
Assert.assertTrue(e.getMessage().contains(
"Rejecting new containers as NodeManager has not" +
" yet connected with ResourceManager"));
// TODO: This should be replaced with an explicit check of the exception
// class name after YARN-142
Assert.assertTrue(e.getRemoteTrace().contains(
NMNotYetReadyException.class.getName()));
}
}
// the number of containers attempted should equal the number of
// containers rejected
Assert.assertEquals(numContainers, numContainersRejected);
} catch (AssertionError ae) {
assertionFailedInThread.set(true);
}
}
}
}
}
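
Both tests above rely on the same handoff pattern: the main test thread and the rebooted status-updater thread meet at a CyclicBarrier(2), so the assertions only run once the resync has actually completed. A minimal illustrative sketch of that pattern (hypothetical names):

import java.util.concurrent.CyclicBarrier;

class BarrierHandoffSketch {
  private final CyclicBarrier syncBarrier = new CyclicBarrier(2);

  // Runs in the status-updater thread after it has re-registered with the RM.
  void updaterThreadAfterReboot() throws Exception {
    syncBarrier.await();   // signal the test thread that resync finished
  }

  // Runs in the main test thread after firing the RESYNC event.
  void testThread() throws Exception {
    syncBarrier.await();   // wait until the updater has re-registered
    // now safe to assert registration counts, rejected containers, etc.
  }
}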

View File

@ -24,17 +24,12 @@ import static org.mockito.Mockito.when;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileReader; import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CyclicBarrier;
import junit.framework.Assert; import junit.framework.Assert;
@ -59,12 +54,9 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.URL; import org.apache.hadoop.yarn.api.records.URL;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.util.BuilderUtils; import org.apache.hadoop.yarn.util.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.ConverterUtils;
import org.junit.After; import org.junit.After;
@ -86,7 +78,6 @@ public class TestNodeManagerShutdown {
static final String user = "nobody"; static final String user = "nobody";
private FileContext localFS; private FileContext localFS;
private ContainerId cId; private ContainerId cId;
private CyclicBarrier syncBarrier = new CyclicBarrier(2);
@Before @Before
public void setup() throws UnsupportedFileSystemException { public void setup() throws UnsupportedFileSystemException {
@ -110,7 +101,7 @@ public class TestNodeManagerShutdown {
NodeManager nm = getNodeManager(); NodeManager nm = getNodeManager();
nm.init(createNMConfig()); nm.init(createNMConfig());
nm.start(); nm.start();
startContainers(nm); startContainer(nm, cId, localFS, tmpDir, processStartFile);
final int MAX_TRIES=20; final int MAX_TRIES=20;
int numTries = 0; int numTries = 0;
@ -150,29 +141,13 @@ public class TestNodeManagerShutdown {
reader.close(); reader.close();
} }
} }
@SuppressWarnings("unchecked")
@Test
public void testKillContainersOnResync() throws IOException, InterruptedException {
NodeManager nm = new TestNodeManager();
YarnConfiguration conf = createNMConfig();
nm.init(conf);
nm.start();
startContainers(nm);
assert ((TestNodeManager) nm).getNMRegistrationCount() == 1; public static void startContainer(NodeManager nm, ContainerId cId,
nm.getNMDispatcher().getEventHandler(). FileContext localFS, File scriptFileDir, File processStartFile)
handle( new NodeManagerEvent(NodeManagerEventType.RESYNC)); throws IOException {
try {
syncBarrier.await();
} catch (BrokenBarrierException e) {
}
assert ((TestNodeManager) nm).getNMRegistrationCount() == 2;
}
private void startContainers(NodeManager nm) throws IOException {
ContainerManagerImpl containerManager = nm.getContainerManager(); ContainerManagerImpl containerManager = nm.getContainerManager();
File scriptFile = createUnhaltingScriptFile(); File scriptFile =
createUnhaltingScriptFile(cId, scriptFileDir, processStartFile);
ContainerLaunchContext containerLaunchContext = ContainerLaunchContext containerLaunchContext =
recordFactory.newRecordInstance(ContainerLaunchContext.class); recordFactory.newRecordInstance(ContainerLaunchContext.class);
@ -218,7 +193,7 @@ public class TestNodeManagerShutdown {
Assert.assertEquals(ContainerState.RUNNING, containerStatus.getState()); Assert.assertEquals(ContainerState.RUNNING, containerStatus.getState());
} }
private ContainerId createContainerId() { public static ContainerId createContainerId() {
ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class); ApplicationId appId = recordFactory.newRecordInstance(ApplicationId.class);
appId.setClusterTimestamp(0); appId.setClusterTimestamp(0);
appId.setId(0); appId.setId(0);
@ -247,8 +222,9 @@ public class TestNodeManagerShutdown {
* Creates a script to run a container that will run forever unless
* stopped by external means.
*/
-private File createUnhaltingScriptFile() throws IOException {
-File scriptFile = Shell.appendScriptExtension(tmpDir, "scriptFile");
+private static File createUnhaltingScriptFile(ContainerId cId,
+File scriptFileDir, File processStartFile) throws IOException {
+File scriptFile = Shell.appendScriptExtension(scriptFileDir, "scriptFile");
PrintWriter fileWriter = new PrintWriter(scriptFile);
if (Shell.WINDOWS) {
fileWriter.println("@echo \"Running testscript for delayed kill\"");
@ -282,48 +258,4 @@ public class TestNodeManagerShutdown {
}
};
}
-class TestNodeManager extends NodeManager {
-private int registrationCount = 0;
-@Override
-protected NodeStatusUpdater createNodeStatusUpdater(Context context,
-Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
-return new TestNodeStatusUpdaterImpl(context, dispatcher,
-healthChecker, metrics);
-}
-public int getNMRegistrationCount() {
-return registrationCount;
-}
-class TestNodeStatusUpdaterImpl extends MockNodeStatusUpdater {
-public TestNodeStatusUpdaterImpl(Context context, Dispatcher dispatcher,
-NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
-super(context, dispatcher, healthChecker, metrics);
-}
-@Override
-protected void registerWithRM() throws YarnRemoteException {
-super.registerWithRM();
-registrationCount++;
-}
-@Override
-protected void rebootNodeStatusUpdater() {
-ConcurrentMap<ContainerId, org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container> containers =
-getNMContext().getContainers();
-// ensure that containers are empty before restart nodeStatusUpdater
-Assert.assertTrue(containers.isEmpty());
-super.rebootNodeStatusUpdater();
-try {
-syncBarrier.await();
-} catch (InterruptedException e) {
-} catch (BrokenBarrierException e) {
-}
-}
-}
-}
}
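The refactoring above makes startContainer and createContainerId public static helpers so that other NodeManager tests can launch an unhalting container without duplicating the setup. A minimal sketch of a caller, where only the two helper signatures come from this patch and the surrounding scaffolding (nm, localFS, tmpDir, processStartFile) is assumed test state:

// Illustrative reuse of the static helpers; nm, localFS, tmpDir and
// processStartFile are assumed to be prepared by the calling test.
ContainerId cId = TestNodeManagerShutdown.createContainerId();
TestNodeManagerShutdown.startContainer(nm, cId, localFS, tmpDir,
    processStartFile);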

View File

@ -42,6 +42,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.yarn.YarnException;
+import org.apache.hadoop.yarn.api.ContainerManager;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;

View File

@ -156,7 +156,13 @@ public abstract class BaseContainerManagerTest {
dirsHandler = nodeHealthChecker.getDiskHandler();
containerManager =
new ContainerManagerImpl(context, exec, delSrvc, nodeStatusUpdater,
-metrics, new ApplicationACLsManager(conf), dirsHandler);
+metrics, new ApplicationACLsManager(conf), dirsHandler) {
+@Override
+public void setBlockNewContainerRequests(
+boolean blockNewContainerRequests) {
+// do nothing
+}
+};
containerManager.init(conf);
}

View File

@ -18,6 +18,9 @@
package org.apache.hadoop.yarn.server.nodemanager.containermanager;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
@ -49,13 +52,18 @@ import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
+import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.URL;
import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
+import org.apache.hadoop.yarn.server.api.ResourceManagerConstants;
import org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedAppsEvent;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.ExitCode;
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor.Signal;
import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
+import org.apache.hadoop.yarn.server.nodemanager.LocalRMInterface;
+import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
+import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
@ -63,7 +71,6 @@ import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.junit.Test;
-import static org.mockito.Mockito.*;
public class TestContainerManager extends BaseContainerManagerTest {
@ -411,7 +418,13 @@ public class TestContainerManager extends BaseContainerManagerTest {
containerManager =
new ContainerManagerImpl(context, exec, delSrvc, nodeStatusUpdater,
-metrics, new ApplicationACLsManager(conf), dirsHandler);
+metrics, new ApplicationACLsManager(conf), dirsHandler) {
+@Override
+public void setBlockNewContainerRequests(
+boolean blockNewContainerRequests) {
+// do nothing
+}
+};
containerManager.init(conf);
containerManager.start();
@ -524,4 +537,77 @@ public class TestContainerManager extends BaseContainerManagerTest {
Assert.assertFalse(targetFile.getAbsolutePath() + " exists!!",
targetFile.exists());
}
@Test
public void testContainerLaunchFromPreviousRM() throws IOException,
InterruptedException {
// There is no real RM registration, simulate and set RMIdentifier
NodeStatusUpdater nodeStatusUpdater = mock(NodeStatusUpdater.class);
when(nodeStatusUpdater.getRMIdentifier()).thenReturn((long) 1234);
containerManager =
new ContainerManagerImpl(context, exec, delSrvc, nodeStatusUpdater,
metrics, new ApplicationACLsManager(conf), dirsHandler) {
@Override
public void setBlockNewContainerRequests(
boolean blockNewContainerRequests) {
// do nothing
}
};
containerManager.init(conf);
containerManager.start();
ContainerLaunchContext containerLaunchContext =
recordFactory.newRecordInstance(ContainerLaunchContext.class);
ContainerId cId1 = createContainerId();
ContainerId cId2 = createContainerId();
containerLaunchContext.setUser(user);
containerLaunchContext
.setLocalResources(new HashMap<String, LocalResource>());
containerLaunchContext.setUser(containerLaunchContext.getUser());
Resource mockResource = mock(Resource.class);
Container mockContainer1 = mock(Container.class);
when(mockContainer1.getId()).thenReturn(cId1);
// Construct the Container with Invalid RMIdentifier
when(mockContainer1.getRMIdentifer()).thenReturn(
(long) ResourceManagerConstants.RM_INVALID_IDENTIFIER);
StartContainerRequest startRequest1 =
recordFactory.newRecordInstance(StartContainerRequest.class);
startRequest1.setContainerLaunchContext(containerLaunchContext);
startRequest1.setContainer(mockContainer1);
boolean catchException = false;
try {
containerManager.startContainer(startRequest1);
} catch (YarnRemoteException e) {
catchException = true;
Assert.assertTrue(e.getMessage().contains(
"Container " + cId1 + " rejected as it is allocated by a previous RM"));
// TO DO: This should be replaced to explicitly check exception
// class name after YARN-142
Assert.assertTrue(e.getRemoteTrace().contains(
InvalidContainerException.class.getName()));
}
// Verify that startContainer fail because of invalid container request
Assert.assertTrue(catchException);
// Construct the Container with a RMIdentifier within current RM
Container mockContainer2 = mock(Container.class);
when(mockContainer2.getId()).thenReturn(cId2);
when(mockContainer2.getRMIdentifer()).thenReturn((long) 1234);
when(mockContainer2.getResource()).thenReturn(mockResource);
StartContainerRequest startRequest2 =
recordFactory.newRecordInstance(StartContainerRequest.class);
startRequest2.setContainerLaunchContext(containerLaunchContext);
startRequest2.setContainer(mockContainer2);
boolean noException = true;
try {
containerManager.startContainer(startRequest2);
} catch (YarnRemoteException e) {
noException = false;
}
// Verify that startContainer get no YarnRemoteException
Assert.assertTrue(noException);
}
}
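The new testContainerLaunchFromPreviousRM above only pins down the observable behaviour: a container whose RM identifier is RM_INVALID_IDENTIFIER is rejected with an InvalidContainerException, while one carrying the current identifier starts normally. The snippet below is a sketch of the kind of check the test exercises, not the ContainerManagerImpl code itself, which is not part of this hunk:

// Hypothetical NM-side validation; the real implementation may differ.
if (container.getRMIdentifer() != nodeStatusUpdater.getRMIdentifier()) {
  String msg = "Container " + containerId
      + " rejected as it is allocated by a previous RM";
  // InvalidContainerException is referenced by the test above; a
  // message-only constructor is assumed here.
  throw new InvalidContainerException(msg);
}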

View File

@ -196,6 +196,7 @@ public class ResourceTrackerService extends AbstractService implements
+ capability + ", assigned nodeId " + nodeId);
response.setNodeAction(NodeAction.NORMAL);
+response.setRMIdentifier(ResourceManager.clusterTimeStamp);
return response;
}
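The added line stamps every registration response with the RM's cluster start timestamp. On the NodeManager side the tests above rely on NodeStatusUpdater#getRMIdentifier() returning that value, so a status updater presumably records it during registration; a sketch under that assumption (the field and the registration flow are illustrative, only setRMIdentifier()/getRMIdentifier() appear in this patch):

// Assumed NM-side handling of the RM identifier received at registration.
private long rmIdentifier;

protected void registerWithRM() throws YarnRemoteException {
  RegisterNodeManagerResponse regNMResponse =
      resourceTracker.registerNodeManager(request);  // assumed RPC call
  this.rmIdentifier = regNMResponse.getRMIdentifier();
}

public long getRMIdentifier() {
  return this.rmIdentifier;
}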

View File

@ -33,11 +33,11 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptStateDataPBImpl;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationStateDataPBImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
-import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationAttemptStateDataProto;
-import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationStateDataProto;
+import org.apache.hadoop.yarn.proto.YarnServerResourceManagerServiceProtos.ApplicationAttemptStateDataProto;
+import org.apache.hadoop.yarn.proto.YarnServerResourceManagerServiceProtos.ApplicationStateDataProto;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateDataPBImpl;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateDataPBImpl;
import org.apache.hadoop.yarn.util.ConverterUtils;
import com.google.common.annotations.VisibleForTesting;

View File

@ -25,8 +25,8 @@ import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptStateDataPBImpl;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationStateDataPBImpl;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateDataPBImpl;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateDataPBImpl;
import org.apache.hadoop.yarn.util.ConverterUtils;
import com.google.common.annotations.VisibleForTesting;

View File

@ -20,8 +20,8 @@ package org.apache.hadoop.yarn.server.resourcemanager.recovery;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptStateDataPBImpl;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationStateDataPBImpl;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateDataPBImpl;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateDataPBImpl;
@Unstable
public class NullRMStateStore extends RMStateStore {

View File

@ -30,12 +30,12 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.Container;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptStateDataPBImpl;
-import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationStateDataPBImpl;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateDataPBImpl;
+import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateDataPBImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppStoredEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;

View File

@ -16,10 +16,12 @@
* limitations under the License.
*/
-package org.apache.hadoop.yarn.api.records;
+package org.apache.hadoop.yarn.server.resourcemanager.recovery.records;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.Container;
/*
* Contains the state data that needs to be persisted for an ApplicationAttempt

View File

@ -16,14 +16,15 @@
* limitations under the License.
*/
-package org.apache.hadoop.yarn.api.records.impl.pb;
+package org.apache.hadoop.yarn.server.resourcemanager.recovery.records;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
-import org.apache.hadoop.yarn.api.records.ApplicationAttemptStateData;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ProtoBase;
-import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationAttemptStateDataProto;
-import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationAttemptStateDataProtoOrBuilder;
+import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptIdPBImpl;
+import org.apache.hadoop.yarn.api.records.impl.pb.ContainerPBImpl;
+import org.apache.hadoop.yarn.proto.YarnServerResourceManagerServiceProtos.ApplicationAttemptStateDataProto;
+import org.apache.hadoop.yarn.proto.YarnServerResourceManagerServiceProtos.ApplicationAttemptStateDataProtoOrBuilder;
public class ApplicationAttemptStateDataPBImpl
extends ProtoBase<ApplicationAttemptStateDataProto>

View File

@ -16,10 +16,12 @@
* limitations under the License.
*/
-package org.apache.hadoop.yarn.api.records;
+package org.apache.hadoop.yarn.server.resourcemanager.recovery.records;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
/**
* Contains all the state data that needs to be stored persistently

View File

@ -16,13 +16,13 @@
* limitations under the License.
*/
-package org.apache.hadoop.yarn.api.records.impl.pb;
+package org.apache.hadoop.yarn.server.resourcemanager.recovery.records;
-import org.apache.hadoop.yarn.api.records.ApplicationStateData;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ProtoBase;
-import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationStateDataProto;
-import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationStateDataProtoOrBuilder;
+import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
+import org.apache.hadoop.yarn.proto.YarnServerResourceManagerServiceProtos.ApplicationStateDataProto;
+import org.apache.hadoop.yarn.proto.YarnServerResourceManagerServiceProtos.ApplicationStateDataProtoOrBuilder;
public class ApplicationStateDataPBImpl
extends ProtoBase<ApplicationStateDataProto>
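The ApplicationStateData and ApplicationAttemptStateData records, together with their PB implementations, move out of the public yarn-api packages into org.apache.hadoop.yarn.server.resourcemanager.recovery.records, and their protos now live in YarnServerResourceManagerServiceProtos. Code that referenced the old locations only needs its imports updated, for example:

// Old locations (removed in this change):
// import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptStateDataPBImpl;
// import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationStateDataPBImpl;
// New locations (as used by the state-store classes above):
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateDataPBImpl;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateDataPBImpl;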

View File

@ -53,6 +53,7 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
+import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.resource.ResourceCalculator;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
@ -1243,7 +1244,7 @@ public class LeafQueue implements CSQueue {
// Create the container
Container container = BuilderUtils.newContainer(containerId, nodeId,
node.getRMNode().getHttpAddress(), capability, priority,
-null);
+null, ResourceManager.clusterTimeStamp);
return container;
}
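BuilderUtils.newContainer gains a trailing RM-identifier argument, and the schedulers pass ResourceManager.clusterTimeStamp so that each allocated container records which RM instance created it. A minimal sketch of the extended call, with every variable other than the identifier assumed to be scheduler-local state:

// Sketch only: the seven-argument newContainer form comes from this patch;
// containerId, nodeId, node, capability, priority and containerToken are
// placeholders.
long rmIdentifier = ResourceManager.clusterTimeStamp;
Container container = BuilderUtils.newContainer(containerId, nodeId,
    node.getRMNode().getHttpAddress(), capability, priority,
    containerToken, rmIdentifier);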

View File

@ -36,6 +36,7 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.resourcemanager.resource.DefaultResourceCalculator;
+import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
@ -173,7 +174,7 @@ public class AppSchedulable extends Schedulable {
// Create the container
Container container = BuilderUtils.newContainer(containerId, nodeId,
node.getRMNode().getHttpAddress(), capability, priority,
-containerToken);
+containerToken, ResourceManager.clusterTimeStamp);
return container;
}
@ -371,4 +372,4 @@ public class AppSchedulable extends Schedulable {
Resources.lessThanOrEqual(RESOURCE_CALCULATOR, null,
request.getCapability(), node.getRMNode().getTotalCapability());
}
}

View File

@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
import org.apache.hadoop.yarn.server.resourcemanager.resource.DefaultResourceCalculator;
import org.apache.hadoop.yarn.server.resourcemanager.resource.ResourceCalculator;
@ -565,7 +566,7 @@ public class FifoScheduler implements ResourceScheduler, Configurable {
// Create the container
Container container = BuilderUtils.newContainer(containerId, nodeId,
node.getRMNode().getHttpAddress(), capability, priority,
-containerToken);
+containerToken, ResourceManager.clusterTimeStamp);
// Allocate!

View File

@ -188,6 +188,7 @@ public class NodeManager implements ContainerManager {
this.nodeId, nodeHttpAddress,
requestContainer.getResource(),
null, null // DKDC - Doesn't matter
+, 0
);
ContainerStatus containerStatus =

View File

@ -36,6 +36,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.DrainDispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
+import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
@ -267,6 +268,21 @@ public class TestResourceTrackerService {
Assert.assertEquals(NodeAction.SHUTDOWN,response.getNodeAction());
}
+@Test
+public void testSetRMIdentifierInRegistration() throws Exception {
+Configuration conf = new Configuration();
+rm = new MockRM(conf);
+rm.start();
+MockNM nm = new MockNM("host1:1234", 5120, rm.getResourceTrackerService());
+RegisterNodeManagerResponse response = nm.registerNode();
+// Verify the RMIdentifier is correctly set in RegisterNodeManagerResponse
+Assert.assertEquals(ResourceManager.clusterTimeStamp,
+response.getRMIdentifier());
+}
@Test
public void testReboot() throws Exception {
Configuration conf = new Configuration();

View File

@ -0,0 +1,84 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager.applicationmasterservice;
import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
import org.apache.hadoop.yarn.server.resourcemanager.TestFifoScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestApplicationMasterService {
private static final Log LOG = LogFactory.getLog(TestFifoScheduler.class);
private final int GB = 1024;
private static YarnConfiguration conf;
@BeforeClass
public static void setup() {
conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, FifoScheduler.class,
ResourceScheduler.class);
}
@Test(timeout = 30000)
public void testRMIdentifierOnContainerAllocation() throws Exception {
MockRM rm = new MockRM(conf);
rm.start();
// Register node1
MockNM nm1 = rm.registerNode("h1:1234", 6 * GB);
// Submit an application
RMApp app1 = rm.submitApp(2048);
// kick the scheduling
nm1.nodeHeartbeat(true);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
MockAM am1 = rm.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
am1.addRequests(new String[] { "h1" }, GB, 1, 1);
AllocateResponse alloc1Response = am1.schedule(); // send the request
// kick the scheduler
nm1.nodeHeartbeat(true);
while (alloc1Response.getAllocatedContainers().size() < 1) {
LOG.info("Waiting for containers to be created for app 1...");
Thread.sleep(1000);
alloc1Response = am1.schedule();
}
// assert RMIdentifer is set properly in allocated containers
Assert.assertEquals(rm.clusterTimeStamp, alloc1Response
.getAllocatedContainers().get(0).getRMIdentifer());
rm.stop();
}
}

View File

@ -69,7 +69,7 @@ public class TestRMContainerImpl {
Priority priority = BuilderUtils.newPriority(5);
Container container = BuilderUtils.newContainer(containerId, nodeId,
-"host:3465", resource, priority, null);
+"host:3465", resource, priority, null, 0);
RMContainer rmContainer = new RMContainerImpl(container, appAttemptId,
nodeId, eventHandler, expirer);
@ -139,7 +139,7 @@ public class TestRMContainerImpl {
Priority priority = BuilderUtils.newPriority(5);
Container container = BuilderUtils.newContainer(containerId, nodeId,
-"host:3465", resource, priority, null);
+"host:3465", resource, priority, null, 0);
RMContainer rmContainer = new RMContainerImpl(container, appAttemptId,
nodeId, eventHandler, expirer);

View File

@ -360,7 +360,7 @@ public class TestContainerManagerSecurity {
Container container =
BuilderUtils.newContainer(newTokenId.getContainerID(), null, null,
BuilderUtils.newResource(newTokenId.getResource().getMemory(),
-newTokenId.getResource().getVirtualCores()), null, null);
+newTokenId.getResource().getVirtualCores()), null, null, 0);
StartContainerRequest request = Records.newRecord(StartContainerRequest.class);
request.setContainerLaunchContext(context);
request.setContainer(container);
@ -547,7 +547,7 @@ public class TestContainerManagerSecurity {
createContainerLaunchContextForTest(tokenId);
Container container =
BuilderUtils.newContainer(tokenId.getContainerID(), null, null,
-BuilderUtils.newResource(2048, 1), null, null);
+BuilderUtils.newResource(2048, 1), null, null, 0);
request.setContainerLaunchContext(context);
request.setContainer(container);
try {
@ -575,7 +575,7 @@ public class TestContainerManagerSecurity {
Container container =
BuilderUtils.newContainer(tokenId.getContainerID(), null, null,
BuilderUtils.newResource(tokenId.getResource().getMemory(), tokenId
-.getResource().getVirtualCores()), null, null);
+.getResource().getVirtualCores()), null, null, 0);
request.setContainerLaunchContext(context);
request.setContainer(container);
try {

View File

@ -0,0 +1,52 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.webproxy;
import static org.junit.Assert.assertEquals;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServer;
import org.apache.hadoop.yarn.service.Service.STATE;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
public class TestWebAppProxyServer {
private WebAppProxyServer webAppProxy = null;
@Before
public void setUp() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.set(YarnConfiguration.PROXY_ADDRESS, "0.0.0.0:8888");
webAppProxy = new WebAppProxyServer();
webAppProxy.init(conf);
}
@After
public void tearDown() throws Exception {
webAppProxy.stop();
}
@Test
public void testStart() {
assertEquals(STATE.INITED, webAppProxy.getServiceState());
webAppProxy.start();
assertEquals(STATE.STARTED, webAppProxy.getServiceState());
}
}