Merge branch 'trunk' into HDFS-6581

Conflicts:
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/StorageType.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsFileStatus.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsLocatedFileStatus.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatPBINode.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFile.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileAttributes.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeLayoutVersion.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/FSImageFormatPBSnapshot.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/web/JsonUtil.java
	hadoop-hdfs-project/hadoop-hdfs/src/main/proto/fsimage.proto
	hadoop-hdfs-project/hadoop-hdfs/src/main/proto/hdfs.proto
	hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java
	hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
	hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFsck.java
arp 2014-09-18 22:26:42 -07:00
commit f8bbf80067
109 changed files with 6615 additions and 1053 deletions

View File

@ -834,6 +834,10 @@ Release 2.6.0 - UNRELEASED
HADOOP-11105. MetricsSystemImpl could leak memory in registered callbacks.
(Chuan Liu via cnauroth)
HADOOP-10982. KMS: Support for multiple Kerberos principals. (tucu)
HADOOP-11109. Site build is broken. (Jian He via atm)
Release 2.5.1 - 2014-09-05
INCOMPATIBLE CHANGES

View File

@ -328,6 +328,15 @@ esac
#
# export HADOOP_BALANCER_OPTS=""
###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""
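# For example, one might raise the Mover heap like this (the value below is
# purely illustrative; size it for your own cluster):
# export HADOOP_MOVER_OPTS="-Xmx2g"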
###
# Advanced Users Only!
###

View File

@ -45,6 +45,7 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.reflect.UndeclaredThrowableException;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URI;
@ -400,6 +401,8 @@ public class KMSClientProvider extends KeyProvider implements CryptoExtension,
});
} catch (IOException ex) {
throw ex;
} catch (UndeclaredThrowableException ex) {
throw new IOException(ex.getUndeclaredThrowable());
} catch (Exception ex) {
throw new IOException(ex);
}
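
For context, the pattern added above can be summarized in a small self-contained sketch. The Callable-based helper below is hypothetical (it is not the KMSClientProvider API); it only illustrates unwrapping an UndeclaredThrowableException into the IOException that callers expect:

import java.io.IOException;
import java.lang.reflect.UndeclaredThrowableException;
import java.util.concurrent.Callable;

public class UnwrapSketch {
  // Run an action and normalize failures to IOException.
  static <T> T run(Callable<T> action) throws IOException {
    try {
      return action.call();
    } catch (IOException ex) {
      throw ex;                      // already the expected type
    } catch (UndeclaredThrowableException ex) {
      // Checked exceptions thrown through a proxy or doAs arrive wrapped;
      // rethrow the real cause instead of the reflective wrapper.
      throw new IOException(ex.getUndeclaredThrowable());
    } catch (Exception ex) {
      throw new IOException(ex);
    }
  }
}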

View File

@ -26,6 +26,7 @@ import java.util.TreeMap;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
@ -689,6 +690,12 @@ public class NetworkTopology {
return rand;
}
@VisibleForTesting
void setRandomSeed(long seed) {
Random rand = getRandom();
rand.setSeed(seed);
}
/** randomly choose one node from <i>scope</i>
* if scope starts with ~, choose one from all the nodes except for the
* ones in <i>scope</i>; otherwise, choose one from <i>scope</i>
@ -870,21 +877,19 @@ public class NetworkTopology {
/**
* Sort nodes array by network distance to <i>reader</i>.
* <p/>
* In a three-level topology, a node can be either local, on the same rack, or
* on a different rack from the reader. Sorting the nodes based on network
* distance from the reader reduces network traffic and improves performance.
* In a three-level topology, a node can be either local, on the same rack,
* or on a different rack from the reader. Sorting the nodes based on network
* distance from the reader reduces network traffic and improves
* performance.
* <p/>
* As an additional twist, we also randomize the nodes at each network
* distance using the provided random seed. This helps with load balancing
* when there is data skew.
*
* @param reader Node where data will be read
* @param nodes Available replicas with the requested data
* @param seed Used to seed the pseudo-random generator that randomizes the
* set of nodes at each network distance.
* distance. This helps with load balancing when there is data skew.
*
* @param reader Node where data will be read
* @param nodes Available replicas with the requested data
* @param activeLen Number of active nodes at the front of the array
*/
public void sortByDistance(Node reader, Node[] nodes, int activeLen,
long seed, boolean randomizeBlockLocationsPerBlock) {
public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
/** Sort weights for the nodes array */
int[] weights = new int[activeLen];
for (int i=0; i<activeLen; i++) {
@ -903,14 +908,7 @@ public class NetworkTopology {
list.add(node);
}
// Seed is normally the block id
// This means we use the same pseudo-random order for each block, for
// potentially better page cache usage.
// Seed is not used if we want to randomize block location for every block
Random rand = getRandom();
if (!randomizeBlockLocationsPerBlock) {
rand.setSeed(seed);
}
int idx = 0;
for (List<Node> list: tree.values()) {
if (list != null) {
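
For reference, a minimal sketch of the new three-argument call (the hostnames and rack paths below are made up, and the ordering comment assumes a plain three-level topology):

import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;

public class SortByDistanceSketch {
  public static void main(String[] args) {
    NetworkTopology cluster = new NetworkTopology();
    Node reader   = new NodeBase("host1", "/rack1");
    Node sameRack = new NodeBase("host2", "/rack1");
    Node offRack  = new NodeBase("host3", "/rack2");
    cluster.add(reader);
    cluster.add(sameRack);
    cluster.add(offRack);

    // Replicas in arbitrary order; after sorting, the local node comes first,
    // then the same-rack node, then the off-rack node. Ties at the same
    // distance are shuffled by the topology's own Random, which tests can pin
    // via the package-private setRandomSeed() added above.
    Node[] nodes = { offRack, sameRack, reader };
    cluster.sortByDistance(reader, nodes, nodes.length);
  }
}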

View File

@ -268,19 +268,17 @@ public class NetworkTopologyWithNodeGroup extends NetworkTopology {
/**
* Sort nodes array by their distances to <i>reader</i>.
* <p/>
* This is the same as
* {@link NetworkTopology#sortByDistance(Node, Node[], long)} except with a
* four-level network topology which contains the additional network distance
* of a "node group" which is between local and same rack.
*
* @param reader Node where data will be read
* @param nodes Available replicas with the requested data
* @param seed Used to seed the pseudo-random generator that randomizes the
* set of nodes at each network distance.
* This is the same as {@link NetworkTopology#sortByDistance(Node, Node[],
* int)} except with a four-level network topology which contains the
* additional network distance of a "node group" which is between local and
* same rack.
*
* @param reader Node where data will be read
* @param nodes Available replicas with the requested data
* @param activeLen Number of active nodes at the front of the array
*/
@Override
public void sortByDistance(Node reader, Node[] nodes, int activeLen,
long seed, boolean randomizeBlockLocationsPerBlock) {
public void sortByDistance(Node reader, Node[] nodes, int activeLen) {
// If reader is not a datanode (not in NetworkTopology tree), we need to
// replace this reader with a sibling leaf node in tree.
if (reader != null && !this.contains(reader)) {
@ -293,8 +291,7 @@ public class NetworkTopologyWithNodeGroup extends NetworkTopology {
return;
}
}
super.sortByDistance(reader, nodes, activeLen, seed,
randomizeBlockLocationsPerBlock);
super.sortByDistance(reader, nodes, activeLen);
}
/** InnerNodeWithNodeGroup represents a switch/router of a data center, rack

View File

@ -104,8 +104,7 @@ public class TestNetworkTopologyWithNodeGroup {
testNodes[1] = dataNodes[2];
testNodes[2] = dataNodes[3];
testNodes[3] = dataNodes[0];
cluster.sortByDistance(dataNodes[0], testNodes,
testNodes.length, 0xDEADBEEF, false);
cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[1]);
assertTrue(testNodes[2] == dataNodes[2]);
@ -116,8 +115,7 @@ public class TestNetworkTopologyWithNodeGroup {
testNodes[1] = dataNodes[4];
testNodes[2] = dataNodes[1];
testNodes[3] = dataNodes[0];
cluster.sortByDistance(dataNodes[0], testNodes,
testNodes.length, 0xDEADBEEF, false);
cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[1]);
@ -126,8 +124,7 @@ public class TestNetworkTopologyWithNodeGroup {
testNodes[1] = dataNodes[3];
testNodes[2] = dataNodes[2];
testNodes[3] = dataNodes[0];
cluster.sortByDistance(dataNodes[0], testNodes,
testNodes.length, 0xDEADBEEF, false);
cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[2]);
@ -136,8 +133,7 @@ public class TestNetworkTopologyWithNodeGroup {
testNodes[1] = dataNodes[7];
testNodes[2] = dataNodes[2];
testNodes[3] = dataNodes[0];
cluster.sortByDistance(computeNode, testNodes,
testNodes.length, 0xDEADBEEF, false);
cluster.sortByDistance(computeNode, testNodes, testNodes.length);
assertTrue(testNodes[0] == dataNodes[0]);
assertTrue(testNodes[1] == dataNodes[2]);
}
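
Since the per-call seed argument is gone, a test that needs a deterministic ordering can instead seed the topology directly. A hedged sketch (setRandomSeed is package-private and @VisibleForTesting, so this only works from test code in the same package; 0xDEADBEEF simply mirrors the seed these tests used to pass):

cluster.setRandomSeed(0xDEADBEEF);
cluster.sortByDistance(dataNodes[0], testNodes, testNodes.length);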

View File

@ -602,7 +602,31 @@ $ keytool -genkey -alias tomcat -keyalg RSA
*** HTTP Kerberos Principals Configuration
TBD
When KMS instances are behind a load-balancer or VIP, clients will use the
hostname of the VIP. For Kerberos SPNEGO authentication, the hostname of the
URL is used to construct the Kerberos service name of the server,
<<<HTTP/#HOSTNAME#>>>. This means that all KMS instances must have a Kerberos
service name with the load-balancer or VIP hostname.
In order to access a specific KMS instance directly, the KMS instance must
also have a Kerberos service name with its own hostname. This is required for
monitoring and admin purposes.
Both Kerberos service principal credentials (for the load-balancer/VIP
hostname and for the actual KMS instance hostname) must be in the keytab file
configured for authentication. And the principal name specified in the
configuration must be '*'. For example:
+---+
<property>
<name>hadoop.kms.authentication.kerberos.principal</name>
<value>*</value>
</property>
+---+
<<NOTE:>> If using HTTPS, the SSL certificate used by the KMS instance must
be configured to support multiple hostnames (see Java 7
<<<keytool>>> SAN extension support for details on how to do this).
*** HTTP Authentication Signature

View File

@ -18,6 +18,9 @@ Trunk (Unreleased)
HDFS-3125. Add JournalService to enable Journal Daemon. (suresh)
HDFS-6584. Support archival storage. (See breakdown of tasks below for
features and contributors)
IMPROVEMENTS
HDFS-4665. Move TestNetworkTopologyWithNodeGroup to common.
@ -261,6 +264,82 @@ Trunk (Unreleased)
HDFS-6981. Fix DN upgrade with layout version change. (Arpit Agarwal)
BREAKDOWN OF HDFS-6584 ARCHIVAL STORAGE
HDFS-6677. Change INodeFile and FSImage to support storage policy ID.
(szetszwo)
HDFS-6670. Add block storage policy support with default HOT, WARM and COLD
policies. (szetszwo)
HDFS-6671. Change BlockPlacementPolicy to consider block storage policy
in replication. (szetszwo)
HDFS-6710. Change BlockPlacementPolicy to consider block storage policy
in replica deletion. (szetszwo)
HDFS-6679. Bump NameNodeLayoutVersion and update editsStored test files.
(vinayakumarb via szetszwo)
HDFS-6686. Change BlockPlacementPolicy to use fallback when some storage
types are unavailable. (szetszwo)
HDFS-6835. Add a new API to set storage policy. (jing9)
HDFS-6847. Support storage policy on directories and include storage policy
in HdfsFileStatus. (Jing Zhao via szetszwo)
HDFS-6801. Add a new data migration tool, Mover, for archiving data.
(szetszwo via jing9)
HDFS-6863. Support migration for snapshot paths. (jing9)
HDFS-6906. Add more tests for BlockStoragePolicy. (szetszwo via jing9)
HDFS-6911. check if a block is already scheduled in Mover.
(szetszwo via jing9)
HDFS-6920. Check the storage type of delNodeHintStorage when deleting
a replica. (szetszwo via jing9)
HDFS-6944. Add retry and termination logic for Mover. (jing9)
HDFS-6969. INode#getStoragePolicyID should always return the latest
storage policy. (jing9)
HDFS-6961. BlockPlacementPolicy#chooseTarget should check each valid
storage type in each choosing round. (jing9)
HDFS-6876. support set/get storage policy in DFSAdmin. (jing9)
HDFS-6997. Add more tests for data migration and replication. (szetszwo)
HDFS-6875. Support migration for a list of specified paths. (jing9)
HDFS-7027. Mover does not terminate when some storage type is out of space.
(szetszwo via jing9)
HDFS-7029. Fix TestDFSInotifyEventInputStream and TestDistributedFileSystem.
(szetszwo via jing9)
HDFS-7028. FSDirectory should not get storage policy id from symlinks.
(szetszwo)
HDFS-7034. Fix TestBlockPlacement and TestStorageMover. (jing9)
HDFS-7039. Fix Balancer tests. (szetszwo via jing9)
HDFS-7062. Skip under construction block for migration. (jing9)
HDFS-7052. Add Mover into hdfs script. (jing9)
HDFS-7072. Fix TestBlockManager and TestStorageMover. (jing9 via szetszwo)
HDFS-6864. Archival Storage: add user documentation. (szetszwo via jing9)
HDFS-7088. Archival Storage: fix TestBalancer and
TestBalancerWithMultipleNameNodes. (szetszwo via jing9)
Release 2.6.0 - UNRELEASED
INCOMPATIBLE CHANGES
@ -480,6 +559,11 @@ Release 2.6.0 - UNRELEASED
HDFS-7003. Add NFS Gateway support for reading and writing to
encryption zones. (clamb via wang)
HDFS-6727. Refresh data volumes on DataNode based on configuration changes
(Lei Xu via cmccabe)
HDFS-6970. Move startFile EDEK retries to the DFSClient. (wang)
OPTIMIZATIONS
HDFS-6690. Deduplicate xattr names in memory. (wang)
@ -687,6 +771,9 @@ Release 2.6.0 - UNRELEASED
HDFS-7078. Fix listEZs to work correctly with snapshots. (wang)
HDFS-6840. Clients are always sent to the same datanode when read
is off rack. (wang)
BREAKDOWN OF HDFS-6134 AND HADOOP-10150 SUBTASKS AND RELATED JIRAS
HDFS-6387. HDFS CLI admin tool for creating & deleting an

View File

@ -36,6 +36,8 @@ function hadoop_usage
echo " journalnode run the DFS journalnode"
echo " lsSnapshottableDir list all snapshottable dirs owned by the current user"
echo " Use -help to see options"
echo " mover run a utility to move block replicas across"
echo " storage types"
echo " namenode run the DFS namenode"
echo " Use -format to initialize the DFS filesystem"
echo " nfs3 run an NFS version 3 gateway"
@ -159,6 +161,11 @@ case ${COMMAND} in
lsSnapshottableDir)
CLASS=org.apache.hadoop.hdfs.tools.snapshot.LsSnapshottableDir
;;
mover)
CLASS=org.apache.hadoop.hdfs.server.mover.Mover
hadoop_debug "Appending HADOOP_MOVER_OPTS onto HADOOP_OPTS"
HADOOP_OPTS="${HADOOP_OPTS} ${HADOOP_MOVER_OPTS}"
;;
namenode)
daemon="true"
CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'

View File

@ -47,7 +47,7 @@ if "%1" == "--config" (
goto print_usage
)
set hdfscommands=dfs namenode secondarynamenode journalnode zkfc datanode dfsadmin haadmin fsck balancer jmxget oiv oev fetchdt getconf groups snapshotDiff lsSnapshottableDir cacheadmin
set hdfscommands=dfs namenode secondarynamenode journalnode zkfc datanode dfsadmin haadmin fsck balancer jmxget oiv oev fetchdt getconf groups snapshotDiff lsSnapshottableDir cacheadmin mover
for %%i in ( %hdfscommands% ) do (
if %hdfs-command% == %%i set hdfscommand=true
)
@ -150,6 +150,11 @@ goto :eof
set CLASS=org.apache.hadoop.hdfs.tools.CacheAdmin
goto :eof
:mover
set CLASS=org.apache.hadoop.hdfs.server.mover.Mover
set HADOOP_OPTS=%HADOOP_OPTS% %HADOOP_MOVER_OPTS%
goto :eof
@rem This changes %1, %2 etc. Hence those cannot be used after calling this.
:make_command_arguments
if "%1" == "--config" (
@ -198,6 +203,7 @@ goto :eof
@echo lsSnapshottableDir list all snapshottable dirs owned by the current user
@echo Use -help to see options
@echo cacheadmin configure the HDFS cache
@echo mover run a utility to move block replicas across storage types
@echo.
@echo Most commands print help when invoked w/o parameters.

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
</configuration>

View File

@ -16,6 +16,7 @@
<!-- Put site-specific property overrides in this file. -->
<configuration>
<configuration xmlns:xi="http://www.w3.org/2001/XInclude">
<xi:include href="blockStoragePolicy-site.xml" />
</configuration>

View File

@ -0,0 +1,419 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.List;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.XAttr;
import org.apache.hadoop.fs.XAttr.NameSpace;
/**
* A block storage policy describes how to select the storage types
* for the replicas of a block.
*/
@InterfaceAudience.Private
public class BlockStoragePolicy {
public static final Log LOG = LogFactory.getLog(BlockStoragePolicy.class);
public static final String DFS_BLOCK_STORAGE_POLICIES_KEY
= "dfs.block.storage.policies";
public static final String DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX
= "dfs.block.storage.policy.";
public static final String DFS_BLOCK_STORAGE_POLICY_CREATION_FALLBACK_KEY_PREFIX
= "dfs.block.storage.policy.creation-fallback.";
public static final String DFS_BLOCK_STORAGE_POLICY_REPLICATION_FALLBACK_KEY_PREFIX
= "dfs.block.storage.policy.replication-fallback.";
public static final String STORAGE_POLICY_XATTR_NAME = "bsp";
/** Set the namespace to TRUSTED so that only privileged users can access it. */
public static final NameSpace XAttrNS = NameSpace.TRUSTED;
public static final int ID_BIT_LENGTH = 4;
public static final int ID_MAX = (1 << ID_BIT_LENGTH) - 1;
public static final byte ID_UNSPECIFIED = 0;
private static final Suite DEFAULT_SUITE = createDefaultSuite();
private static Suite createDefaultSuite() {
final BlockStoragePolicy[] policies = new BlockStoragePolicy[1 << ID_BIT_LENGTH];
final StorageType[] storageTypes = {StorageType.DISK};
final byte defaultPolicyId = 12;
policies[defaultPolicyId] = new BlockStoragePolicy(defaultPolicyId, "HOT",
storageTypes, StorageType.EMPTY_ARRAY, StorageType.EMPTY_ARRAY);
return new Suite(defaultPolicyId, policies);
}
/** A block storage policy suite. */
public static class Suite {
private final byte defaultPolicyID;
private final BlockStoragePolicy[] policies;
private Suite(byte defaultPolicyID, BlockStoragePolicy[] policies) {
this.defaultPolicyID = defaultPolicyID;
this.policies = policies;
}
/** @return the corresponding policy. */
public BlockStoragePolicy getPolicy(byte id) {
// id == 0 means policy not specified.
return id == 0? getDefaultPolicy(): policies[id];
}
/** @return the default policy. */
public BlockStoragePolicy getDefaultPolicy() {
return getPolicy(defaultPolicyID);
}
public BlockStoragePolicy getPolicy(String policyName) {
if (policies != null) {
for (BlockStoragePolicy policy : policies) {
if (policy != null && policy.name.equals(policyName)) {
return policy;
}
}
}
return null;
}
}
/** A 4-bit policy ID */
private final byte id;
/** Policy name */
private final String name;
/** The storage types to store the replicas of a new block. */
private final StorageType[] storageTypes;
/** The fallback storage type for block creation. */
private final StorageType[] creationFallbacks;
/** The fallback storage type for replication. */
private final StorageType[] replicationFallbacks;
@VisibleForTesting
public BlockStoragePolicy(byte id, String name, StorageType[] storageTypes,
StorageType[] creationFallbacks, StorageType[] replicationFallbacks) {
this.id = id;
this.name = name;
this.storageTypes = storageTypes;
this.creationFallbacks = creationFallbacks;
this.replicationFallbacks = replicationFallbacks;
}
/**
* @return a list of {@link StorageType}s for storing the replicas of a block.
*/
public List<StorageType> chooseStorageTypes(final short replication) {
final List<StorageType> types = new LinkedList<StorageType>();
int i = 0;
for(; i < replication && i < storageTypes.length; i++) {
types.add(storageTypes[i]);
}
final StorageType last = storageTypes[storageTypes.length - 1];
for(; i < replication; i++) {
types.add(last);
}
return types;
}
/**
* Choose the storage types for storing the remaining replicas, given the
* replication number and the storage types of the chosen replicas.
*
* @param replication the replication number.
* @param chosen the storage types of the chosen replicas.
* @return a list of {@link StorageType}s for storing the replicas of a block.
*/
public List<StorageType> chooseStorageTypes(final short replication,
final Iterable<StorageType> chosen) {
return chooseStorageTypes(replication, chosen, null);
}
private List<StorageType> chooseStorageTypes(final short replication,
final Iterable<StorageType> chosen, final List<StorageType> excess) {
final List<StorageType> types = chooseStorageTypes(replication);
diff(types, chosen, excess);
return types;
}
/**
* Choose the storage types for storing the remaining replicas, given the
* replication number, the storage types of the chosen replicas and
* the unavailable storage types. It uses fallback storage when the desired
* storage type is unavailable.
*
* @param replication the replication number.
* @param chosen the storage types of the chosen replicas.
* @param unavailables the unavailable storage types.
* @param isNewBlock Is it for new block creation?
* @return a list of {@link StorageType}s for storing the replicas of a block.
*/
public List<StorageType> chooseStorageTypes(final short replication,
final Iterable<StorageType> chosen,
final EnumSet<StorageType> unavailables,
final boolean isNewBlock) {
final List<StorageType> excess = new LinkedList<StorageType>();
final List<StorageType> storageTypes = chooseStorageTypes(
replication, chosen, excess);
final int expectedSize = storageTypes.size() - excess.size();
final List<StorageType> removed = new LinkedList<StorageType>();
for(int i = storageTypes.size() - 1; i >= 0; i--) {
// replace/remove unavailable storage types.
final StorageType t = storageTypes.get(i);
if (unavailables.contains(t)) {
final StorageType fallback = isNewBlock?
getCreationFallback(unavailables)
: getReplicationFallback(unavailables);
if (fallback == null) {
removed.add(storageTypes.remove(i));
} else {
storageTypes.set(i, fallback);
}
}
}
// remove excess storage types after fallback replacement.
diff(storageTypes, excess, null);
if (storageTypes.size() < expectedSize) {
LOG.warn("Failed to place enough replicas: expected size is " + expectedSize
+ " but only " + storageTypes.size() + " storage types can be selected "
+ "(replication=" + replication
+ ", selected=" + storageTypes
+ ", unavailable=" + unavailables
+ ", removed=" + removed
+ ", policy=" + this + ")");
}
return storageTypes;
}
/**
* Compute the list difference t = t - c.
* Further, if e is not null, set e = e + c - t;
*/
private static void diff(List<StorageType> t, Iterable<StorageType> c,
List<StorageType> e) {
for(StorageType storagetype : c) {
final int i = t.indexOf(storagetype);
if (i >= 0) {
t.remove(i);
} else if (e != null) {
e.add(storagetype);
}
}
}
/**
* Choose excess storage types for deletion, given the
* replication number and the storage types of the chosen replicas.
*
* @param replication the replication number.
* @param chosen the storage types of the chosen replicas.
* @return a list of {@link StorageType}s for deletion.
*/
public List<StorageType> chooseExcess(final short replication,
final Iterable<StorageType> chosen) {
final List<StorageType> types = chooseStorageTypes(replication);
final List<StorageType> excess = new LinkedList<StorageType>();
diff(types, chosen, excess);
return excess;
}
/** @return the fallback {@link StorageType} for creation. */
public StorageType getCreationFallback(EnumSet<StorageType> unavailables) {
return getFallback(unavailables, creationFallbacks);
}
/** @return the fallback {@link StorageType} for replication. */
public StorageType getReplicationFallback(EnumSet<StorageType> unavailables) {
return getFallback(unavailables, replicationFallbacks);
}
@Override
public int hashCode() {
return Byte.valueOf(id).hashCode();
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
} else if (obj == null || !(obj instanceof BlockStoragePolicy)) {
return false;
}
final BlockStoragePolicy that = (BlockStoragePolicy)obj;
return this.id == that.id;
}
@Override
public String toString() {
return getClass().getSimpleName() + "{" + name + ":" + id
+ ", storageTypes=" + Arrays.asList(storageTypes)
+ ", creationFallbacks=" + Arrays.asList(creationFallbacks)
+ ", replicationFallbacks=" + Arrays.asList(replicationFallbacks);
}
public byte getId() {
return id;
}
public String getName() {
return name;
}
private static StorageType getFallback(EnumSet<StorageType> unavailables,
StorageType[] fallbacks) {
for(StorageType fb : fallbacks) {
if (!unavailables.contains(fb)) {
return fb;
}
}
return null;
}
private static byte parseID(String idString, String element, Configuration conf) {
byte id = 0;
try {
id = Byte.parseByte(idString);
} catch(NumberFormatException nfe) {
throwIllegalArgumentException("Failed to parse policy ID \"" + idString
+ "\" to a " + ID_BIT_LENGTH + "-bit integer", conf);
}
if (id < 0) {
throwIllegalArgumentException("Invalid policy ID: id = " + id
+ " < 1 in \"" + element + "\"", conf);
} else if (id == 0) {
throw new IllegalArgumentException("Policy ID 0 is reserved: " + element);
} else if (id > ID_MAX) {
throwIllegalArgumentException("Invalid policy ID: id = " + id
+ " > MAX = " + ID_MAX + " in \"" + element + "\"", conf);
}
return id;
}
private static StorageType[] parseStorageTypes(String[] strings) {
if (strings == null || strings.length == 0) {
return StorageType.EMPTY_ARRAY;
}
final StorageType[] types = new StorageType[strings.length];
for(int i = 0; i < types.length; i++) {
types[i] = StorageType.valueOf(strings[i].trim().toUpperCase());
}
return types;
}
private static StorageType[] readStorageTypes(byte id, String keyPrefix,
Configuration conf) {
final String key = keyPrefix + id;
final String[] values = conf.getStrings(key);
try {
return parseStorageTypes(values);
} catch(Exception e) {
throw new IllegalArgumentException("Failed to parse " + key
+ " \"" + conf.get(key), e);
}
}
private static BlockStoragePolicy readBlockStoragePolicy(byte id, String name,
Configuration conf) {
final StorageType[] storageTypes = readStorageTypes(id,
DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX, conf);
if (storageTypes.length == 0) {
throw new IllegalArgumentException(
DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX + id + " is missing or is empty.");
}
final StorageType[] creationFallbacks = readStorageTypes(id,
DFS_BLOCK_STORAGE_POLICY_CREATION_FALLBACK_KEY_PREFIX, conf);
final StorageType[] replicationFallbacks = readStorageTypes(id,
DFS_BLOCK_STORAGE_POLICY_REPLICATION_FALLBACK_KEY_PREFIX, conf);
return new BlockStoragePolicy(id, name, storageTypes, creationFallbacks,
replicationFallbacks);
}
/** Read {@link Suite} from conf. */
public static Suite readBlockStorageSuite(Configuration conf) {
final BlockStoragePolicy[] policies = new BlockStoragePolicy[1 << ID_BIT_LENGTH];
final String[] values = conf.getStrings(DFS_BLOCK_STORAGE_POLICIES_KEY);
if (values == null) {
// conf property is missing, use default suite.
return DEFAULT_SUITE;
}
byte firstID = -1;
for(String v : values) {
v = v.trim();
final int i = v.indexOf(':');
if (i < 0) {
throwIllegalArgumentException("Failed to parse element \"" + v
+ "\" (expected format is NAME:ID)", conf);
} else if (i == 0) {
throwIllegalArgumentException("Policy name is missing in \"" + v + "\"", conf);
} else if (i == v.length() - 1) {
throwIllegalArgumentException("Policy ID is missing in \"" + v + "\"", conf);
}
final String name = v.substring(0, i).trim();
for(int j = 1; j < policies.length; j++) {
if (policies[j] != null && policies[j].name.equals(name)) {
throwIllegalArgumentException("Policy name duplication: \""
+ name + "\" appears more than once", conf);
}
}
final byte id = parseID(v.substring(i + 1).trim(), v, conf);
if (policies[id] != null) {
throwIllegalArgumentException("Policy duplication: ID " + id
+ " appears more than once", conf);
}
policies[id] = readBlockStoragePolicy(id, name, conf);
String prefix = "";
if (firstID == -1) {
firstID = id;
prefix = "(default) ";
}
LOG.info(prefix + policies[id]);
}
if (firstID == -1) {
throwIllegalArgumentException("Empty list is not allowed", conf);
}
return new Suite(firstID, policies);
}
public static String buildXAttrName() {
return XAttrNS.toString().toLowerCase() + "." + STORAGE_POLICY_XATTR_NAME;
}
public static XAttr buildXAttr(byte policyId) {
final String name = buildXAttrName();
return XAttrHelper.buildXAttr(name, new byte[] { policyId });
}
public static boolean isStoragePolicyXAttr(XAttr xattr) {
return xattr != null && xattr.getNameSpace() == BlockStoragePolicy.XAttrNS
&& xattr.getName().equals(BlockStoragePolicy.STORAGE_POLICY_XATTR_NAME);
}
private static void throwIllegalArgumentException(String message,
Configuration conf) {
throw new IllegalArgumentException(message + " in "
+ DFS_BLOCK_STORAGE_POLICIES_KEY + " \""
+ conf.get(DFS_BLOCK_STORAGE_POLICIES_KEY) + "\".");
}
}
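
To make the configuration format concrete, here is a minimal, self-contained sketch of assembling a suite from a Configuration using the keys defined above. The policy names, IDs, storage types and fallbacks are illustrative examples, not the project's shipped defaults:

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.StorageType;

public class StoragePolicySuiteSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Declare the suite as NAME:ID pairs, then the storage types (and
    // optional fallbacks) per ID. These values are made up for illustration.
    conf.set(BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICIES_KEY,
        "HOT:12, WARM:8, COLD:4");
    conf.set(BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX + 12, "DISK");
    conf.set(BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX + 8,
        "DISK, ARCHIVE");
    conf.set(BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX + 4, "ARCHIVE");
    conf.set(
        BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICY_REPLICATION_FALLBACK_KEY_PREFIX + 8,
        "DISK");

    BlockStoragePolicy.Suite suite = BlockStoragePolicy.readBlockStorageSuite(conf);
    BlockStoragePolicy warm = suite.getPolicy("WARM");

    // For replication 3 with types [DISK, ARCHIVE], the last type is repeated:
    // [DISK, ARCHIVE, ARCHIVE].
    List<StorageType> types = warm.chooseStorageTypes((short) 3);
    System.out.println(warm + " -> " + types);
  }
}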

View File

@ -1763,6 +1763,25 @@ public class DFSClient implements java.io.Closeable, RemotePeerFactory,
}
}
/**
* Set storage policy for an existing file/directory
* @param src file/directory name
* @param policyName name of the storage policy
*/
public void setStoragePolicy(String src, String policyName)
throws IOException {
try {
namenode.setStoragePolicy(src, policyName);
} catch (RemoteException e) {
throw e.unwrapRemoteException(AccessControlException.class,
FileNotFoundException.class,
SafeModeException.class,
NSQuotaExceededException.class,
UnresolvedPathException.class,
SnapshotAccessControlException.class);
}
}
/**
* Rename file or directory.
* @see ClientProtocol#rename(String, String)

View File

@ -232,9 +232,6 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final String DFS_NAMENODE_MIN_SUPPORTED_DATANODE_VERSION_KEY = "dfs.namenode.min.supported.datanode.version";
public static final String DFS_NAMENODE_MIN_SUPPORTED_DATANODE_VERSION_DEFAULT = "3.0.0-SNAPSHOT";
public static final String DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK = "dfs.namenode.randomize-block-locations-per-block";
public static final boolean DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK_DEFAULT = false;
public static final String DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY = "dfs.namenode.edits.dir.minimum";
public static final int DFS_NAMENODE_EDITS_DIR_MINIMUM_DEFAULT = 1;
@ -386,6 +383,12 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final int DFS_BALANCER_MOVERTHREADS_DEFAULT = 1000;
public static final String DFS_BALANCER_DISPATCHERTHREADS_KEY = "dfs.balancer.dispatcherThreads";
public static final int DFS_BALANCER_DISPATCHERTHREADS_DEFAULT = 200;
public static final String DFS_MOVER_MOVEDWINWIDTH_KEY = "dfs.mover.movedWinWidth";
public static final long DFS_MOVER_MOVEDWINWIDTH_DEFAULT = 5400*1000L;
public static final String DFS_MOVER_MOVERTHREADS_KEY = "dfs.mover.moverThreads";
public static final int DFS_MOVER_MOVERTHREADS_DEFAULT = 1000;
public static final String DFS_DATANODE_ADDRESS_KEY = "dfs.datanode.address";
public static final int DFS_DATANODE_DEFAULT_PORT = 50010;
public static final String DFS_DATANODE_ADDRESS_DEFAULT = "0.0.0.0:" + DFS_DATANODE_DEFAULT_PORT;
@ -449,6 +452,15 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final Class<BlockPlacementPolicyDefault> DFS_BLOCK_REPLICATOR_CLASSNAME_DEFAULT = BlockPlacementPolicyDefault.class;
public static final String DFS_REPLICATION_MAX_KEY = "dfs.replication.max";
public static final int DFS_REPLICATION_MAX_DEFAULT = 512;
public static final String DFS_BLOCK_STORAGE_POLICIES_KEY
= BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICIES_KEY;
public static final String DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX
= BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICY_KEY_PREFIX;
public static final String DFS_BLOCK_STORAGE_POLICY_CREATION_FALLBACK_KEY_PREFIX
= BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICY_CREATION_FALLBACK_KEY_PREFIX;
public static final String DFS_BLOCK_STORAGE_POLICY_REPLICATION_FALLBACK_KEY_PREFIX
= BlockStoragePolicy.DFS_BLOCK_STORAGE_POLICY_REPLICATION_FALLBACK_KEY_PREFIX;
public static final String DFS_DF_INTERVAL_KEY = "dfs.df.interval";
public static final int DFS_DF_INTERVAL_DEFAULT = 60000;
public static final String DFS_BLOCKREPORT_INTERVAL_MSEC_KEY = "dfs.blockreport.intervalMsec";

View File

@ -41,6 +41,7 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import com.google.common.base.Preconditions;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.crypto.CipherSuite;
import org.apache.hadoop.fs.CanSetDropBehind;
@ -76,6 +77,7 @@ import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException;
import org.apache.hadoop.hdfs.server.namenode.RetryStartFileException;
import org.apache.hadoop.hdfs.server.namenode.SafeModeException;
import org.apache.hadoop.io.EnumSetWritable;
import org.apache.hadoop.io.IOUtils;
@ -126,6 +128,13 @@ import com.google.common.cache.RemovalNotification;
public class DFSOutputStream extends FSOutputSummer
implements Syncable, CanSetDropBehind {
private static final int MAX_PACKETS = 80; // each packet 64K, total 5MB
/**
* Number of times to retry creating a file when there are transient
* errors (typically related to encryption zones and KeyProvider operations).
*/
@VisibleForTesting
public static final int CREATE_RETRY_COUNT = 10;
private final DFSClient dfsClient;
private final long dfsclientSlowLogThresholdMs;
private Socket s;
@ -1651,23 +1660,46 @@ public class DFSOutputStream extends FSOutputSummer
short replication, long blockSize, Progressable progress, int buffersize,
DataChecksum checksum, String[] favoredNodes,
List<CipherSuite> cipherSuites) throws IOException {
final HdfsFileStatus stat;
try {
stat = dfsClient.namenode.create(src, masked, dfsClient.clientName,
new EnumSetWritable<CreateFlag>(flag), createParent, replication,
blockSize, cipherSuites);
} catch(RemoteException re) {
throw re.unwrapRemoteException(AccessControlException.class,
DSQuotaExceededException.class,
FileAlreadyExistsException.class,
FileNotFoundException.class,
ParentNotDirectoryException.class,
NSQuotaExceededException.class,
SafeModeException.class,
UnresolvedPathException.class,
SnapshotAccessControlException.class,
UnknownCipherSuiteException.class);
HdfsFileStatus stat = null;
// Retry the create if we get a RetryStartFileException up to a maximum
// number of times
boolean shouldRetry = true;
int retryCount = CREATE_RETRY_COUNT;
while (shouldRetry) {
shouldRetry = false;
try {
stat = dfsClient.namenode.create(src, masked, dfsClient.clientName,
new EnumSetWritable<CreateFlag>(flag), createParent, replication,
blockSize, cipherSuites);
break;
} catch (RemoteException re) {
IOException e = re.unwrapRemoteException(
AccessControlException.class,
DSQuotaExceededException.class,
FileAlreadyExistsException.class,
FileNotFoundException.class,
ParentNotDirectoryException.class,
NSQuotaExceededException.class,
RetryStartFileException.class,
SafeModeException.class,
UnresolvedPathException.class,
SnapshotAccessControlException.class,
UnknownCipherSuiteException.class);
if (e instanceof RetryStartFileException) {
if (retryCount > 0) {
shouldRetry = true;
retryCount--;
} else {
throw new IOException("Too many retries because of encryption" +
" zone operations", e);
}
} else {
throw e;
}
}
}
Preconditions.checkNotNull(stat, "HdfsFileStatus should not be null!");
final DFSOutputStream out = new DFSOutputStream(dfsClient, src, stat,
flag, progress, checksum, favoredNodes);
out.start();

View File

@ -471,7 +471,39 @@ public class DistributedFileSystem extends FileSystem {
}
}.resolve(this, absF);
}
/**
* Set the source path to the specified storage policy.
*
* @param src The source path referring to either a directory or a file.
* @param policyName The name of the storage policy.
*/
public void setStoragePolicy(final Path src, final String policyName)
throws IOException {
statistics.incrementWriteOps(1);
Path absF = fixRelativePart(src);
new FileSystemLinkResolver<Void>() {
@Override
public Void doCall(final Path p)
throws IOException, UnresolvedLinkException {
dfs.setStoragePolicy(getPathName(p), policyName);
return null;
}
@Override
public Void next(final FileSystem fs, final Path p)
throws IOException {
if (fs instanceof DistributedFileSystem) {
((DistributedFileSystem) fs).setStoragePolicy(p, policyName);
return null;
} else {
throw new UnsupportedOperationException(
"Cannot perform setStoragePolicy on a non-DistributedFileSystem: "
+ src + " -> " + p);
}
}
}.resolve(this, absF);
}
/**
* Move blocks from srcs to trg and delete srcs afterwards.
* The file block sizes must be the same.
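
A hedged sketch of client-side usage of the new API (the path and policy name below are placeholders; the named policy must exist in the cluster's configured storage policy suite):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;

public class SetStoragePolicySketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    if (fs instanceof DistributedFileSystem) {
      // Assign the (example) COLD policy to an (example) directory.
      ((DistributedFileSystem) fs).setStoragePolicy(
          new Path("/archive/logs"), "COLD");
    }
  }
}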

View File

@ -33,9 +33,11 @@ import org.apache.hadoop.classification.InterfaceStability;
public enum StorageType {
DISK,
SSD,
RAM_DISK;
ARCHIVE,
RAM_DISK;
public static final StorageType DEFAULT = DISK;
public static final StorageType[] EMPTY_ARRAY = {};
private static final StorageType[] VALUES = values();
@ -43,4 +45,4 @@ public enum StorageType {
public static List<StorageType> asList() {
return Arrays.asList(VALUES);
}
}
}

View File

@ -259,6 +259,20 @@ public interface ClientProtocol {
FileNotFoundException, SafeModeException, UnresolvedLinkException,
SnapshotAccessControlException, IOException;
/**
* Set the storage policy for a file/directory
* @param src Path of an existing file/directory.
* @param policyName The name of the storage policy
* @throws SnapshotAccessControlException If access is denied
* @throws UnresolvedLinkException if <code>src</code> contains a symlink
* @throws FileNotFoundException If file/dir <code>src</code> is not found
* @throws QuotaExceededException If changes violate the quota restriction
*/
@Idempotent
public void setStoragePolicy(String src, String policyName)
throws SnapshotAccessControlException, UnresolvedLinkException,
FileNotFoundException, QuotaExceededException, IOException;
/**
* Set permissions for an existing file/directory.
*

View File

@ -160,5 +160,8 @@ public class HdfsConstants {
= DFSUtil.string2Bytes(DOT_SNAPSHOT_DIR);
public static final String SEPARATOR_DOT_SNAPSHOT_DIR
= Path.SEPARATOR + DOT_SNAPSHOT_DIR;
= Path.SEPARATOR + DOT_SNAPSHOT_DIR;
public static final String SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR
= Path.SEPARATOR + DOT_SNAPSHOT_DIR + Path.SEPARATOR;
}

View File

@ -51,6 +51,7 @@ public class HdfsFileStatus {
// Used by dir, not including dot and dotdot. Always zero for a regular file.
private final int childrenNum;
private final byte storagePolicy;
public static final byte[] EMPTY_NAME = new byte[0];
@ -73,7 +74,7 @@ public class HdfsFileStatus {
long blocksize, boolean isLazyPersist, long modification_time,
long access_time, FsPermission permission, String owner,
String group, byte[] symlink, byte[] path, long fileId,
int childrenNum, FileEncryptionInfo feInfo) {
int childrenNum, FileEncryptionInfo feInfo, byte storagePolicy) {
this.length = length;
this.isdir = isdir;
this.block_replication = (short)block_replication;
@ -93,6 +94,7 @@ public class HdfsFileStatus {
this.fileId = fileId;
this.childrenNum = childrenNum;
this.feInfo = feInfo;
this.storagePolicy = storagePolicy;
}
/**
@ -261,6 +263,11 @@ public class HdfsFileStatus {
return childrenNum;
}
/** @return the storage policy id */
public final byte getStoragePolicy() {
return storagePolicy;
}
public final FileStatus makeQualified(URI defaultUri, Path path) {
return new FileStatus(getLen(), isDir(), getReplication(),
getBlockSize(), isLazyPersist(), getModificationTime(),

View File

@ -35,7 +35,7 @@ import org.apache.hadoop.hdfs.DFSUtil;
@InterfaceStability.Evolving
public class HdfsLocatedFileStatus extends HdfsFileStatus {
private final LocatedBlocks locations;
/**
* Constructor
*
@ -58,13 +58,14 @@ public class HdfsLocatedFileStatus extends HdfsFileStatus {
int block_replication, long blocksize, boolean isLazyPersist,
long modification_time, long access_time, FsPermission permission,
String owner, String group, byte[] symlink, byte[] path, long fileId,
LocatedBlocks locations, int childrenNum, FileEncryptionInfo feInfo) {
LocatedBlocks locations, int childrenNum, FileEncryptionInfo feInfo,
byte storagePolicy) {
super(length, isdir, block_replication, blocksize, isLazyPersist,
modification_time, access_time, permission, owner, group, symlink,
path, fileId, childrenNum, feInfo);
path, fileId, childrenNum, feInfo, storagePolicy);
this.locations = locations;
}
public LocatedBlocks getBlockLocations() {
return locations;
}

View File

@ -24,6 +24,7 @@ import java.util.Date;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSUtil;
/**
@ -61,7 +62,7 @@ public class SnapshottableDirectoryStatus {
int snapshotNumber, int snapshotQuota, byte[] parentFullPath) {
this.dirStatus = new HdfsFileStatus(0, true, 0, 0, false, modification_time,
access_time, permission, owner, group, null, localName, inodeId,
childrenNum, null);
childrenNum, null, BlockStoragePolicy.ID_UNSPECIFIED);
this.snapshotNumber = snapshotNumber;
this.snapshotQuota = snapshotQuota;
this.parentFullPath = parentFullPath;

View File

@ -173,6 +173,8 @@ import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetRep
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetReplicationResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetSafeModeRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetSafeModeResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetStoragePolicyRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetStoragePolicyResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetTimesRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetTimesResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.UpdateBlockForPipelineRequestProto;
@ -236,6 +238,8 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
static final GetSnapshottableDirListingResponseProto
NULL_GET_SNAPSHOTTABLE_DIR_LISTING_RESPONSE =
GetSnapshottableDirListingResponseProto.newBuilder().build();
static final SetStoragePolicyResponseProto VOID_SET_STORAGE_POLICY_RESPONSE =
SetStoragePolicyResponseProto.newBuilder().build();
private static final CreateResponseProto VOID_CREATE_RESPONSE =
CreateResponseProto.newBuilder().build();
@ -1413,6 +1417,18 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
return VOID_CHECKACCESS_RESPONSE;
}
@Override
public SetStoragePolicyResponseProto setStoragePolicy(
RpcController controller, SetStoragePolicyRequestProto request)
throws ServiceException {
try {
server.setStoragePolicy(request.getSrc(), request.getPolicyName());
} catch (IOException e) {
throw new ServiceException(e);
}
return VOID_SET_STORAGE_POLICY_RESPONSE;
}
public GetCurrentEditLogTxidResponseProto getCurrentEditLogTxid(RpcController controller,
GetCurrentEditLogTxidRequestProto req) throws ServiceException {
try {

View File

@ -64,7 +64,9 @@ import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.NSQuotaExceededException;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
import org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException;
import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport;
import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
import org.apache.hadoop.hdfs.protocol.proto.AclProtos.GetAclStatusRequestProto;
@ -152,6 +154,7 @@ import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetTim
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.UpdateBlockForPipelineRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.UpdatePipelineRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CheckAccessRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.SetStoragePolicyRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.EncryptionZonesProtos;
import org.apache.hadoop.hdfs.protocol.proto.EncryptionZonesProtos.CreateEncryptionZoneRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.EncryptionZonesProtos.GetEZForPathRequestProto;
@ -1435,6 +1438,19 @@ public class ClientNamenodeProtocolTranslatorPB implements
}
}
@Override
public void setStoragePolicy(String src, String policyName)
throws SnapshotAccessControlException, UnresolvedLinkException,
FileNotFoundException, QuotaExceededException, IOException {
SetStoragePolicyRequestProto req = SetStoragePolicyRequestProto
.newBuilder().setSrc(src).setPolicyName(policyName).build();
try {
rpcProxy.setStoragePolicy(null, req);
} catch (ServiceException e) {
throw ProtobufHelper.getRemoteException(e);
}
}
public long getCurrentEditLogTxid() throws IOException {
GetCurrentEditLogTxidRequestProto req = GetCurrentEditLogTxidRequestProto
.getDefaultInstance();

View File

@ -44,6 +44,7 @@ import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.proto.HAServiceProtocolProtos;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.inotify.Event;
@ -176,6 +177,7 @@ import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifie
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NodeType;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature;
import org.apache.hadoop.hdfs.server.namenode.INodeId;
@ -893,9 +895,25 @@ public class PBHelper {
}
builder.addAllTargets(convert(cmd.getTargets()))
.addAllTargetStorageUuids(convert(cmd.getTargetStorageIDs()));
StorageType[][] types = cmd.getTargetStorageTypes();
if (types != null) {
builder.addAllTargetStorageTypes(convert(types));
}
return builder.build();
}
private static List<StorageTypesProto> convert(StorageType[][] types) {
List<StorageTypesProto> list = Lists.newArrayList();
if (types != null) {
for (StorageType[] ts : types) {
StorageTypesProto.Builder builder = StorageTypesProto.newBuilder();
builder.addAllStorageTypes(convertStorageTypes(ts));
list.add(builder.build());
}
}
return list;
}
public static BlockIdCommandProto convert(BlockIdCommand cmd) {
BlockIdCommandProto.Builder builder = BlockIdCommandProto.newBuilder()
.setBlockPoolId(cmd.getBlockPoolId());
@ -1024,7 +1042,7 @@ public class PBHelper {
} else {
for(int i = 0; i < targetStorageTypes.length; i++) {
List<StorageTypeProto> p = targetStorageTypesList.get(i).getStorageTypesList();
targetStorageTypes[i] = p.toArray(new StorageType[p.size()]);
targetStorageTypes[i] = convertStorageTypes(p, targets[i].length);
}
}
@ -1338,8 +1356,9 @@ public class PBHelper {
fs.hasFileId()? fs.getFileId(): INodeId.GRANDFATHER_INODE_ID,
fs.hasLocations() ? PBHelper.convert(fs.getLocations()) : null,
fs.hasChildrenNum() ? fs.getChildrenNum() : -1,
fs.hasFileEncryptionInfo() ? convert(fs.getFileEncryptionInfo()) :
null);
fs.hasFileEncryptionInfo() ? convert(fs.getFileEncryptionInfo()) : null,
fs.hasStoragePolicy() ? (byte) fs.getStoragePolicy()
: BlockStoragePolicy.ID_UNSPECIFIED);
}
public static SnapshottableDirectoryStatus convert(
@ -1386,7 +1405,8 @@ public class PBHelper {
setGroup(fs.getGroup()).
setFileId(fs.getFileId()).
setChildrenNum(fs.getChildrenNum()).
setPath(ByteString.copyFrom(fs.getLocalNameInBytes()));
setPath(ByteString.copyFrom(fs.getLocalNameInBytes())).
setStoragePolicy(fs.getStoragePolicy());
if (fs.isSymlink()) {
builder.setSymlink(ByteString.copyFrom(fs.getSymlinkInBytes()));
}
@ -1394,7 +1414,8 @@ public class PBHelper {
builder.setFileEncryptionInfo(convert(fs.getFileEncryptionInfo()));
}
if (fs instanceof HdfsLocatedFileStatus) {
LocatedBlocks locations = ((HdfsLocatedFileStatus)fs).getBlockLocations();
final HdfsLocatedFileStatus lfs = (HdfsLocatedFileStatus) fs;
LocatedBlocks locations = lfs.getBlockLocations();
if (locations != null) {
builder.setLocations(PBHelper.convert(locations));
}
@ -1707,6 +1728,8 @@ public class PBHelper {
return StorageTypeProto.DISK;
case SSD:
return StorageTypeProto.SSD;
case ARCHIVE:
return StorageTypeProto.ARCHIVE;
case RAM_DISK:
return StorageTypeProto.RAM_DISK;
default:
@ -1737,6 +1760,8 @@ public class PBHelper {
return StorageType.DISK;
case SSD:
return StorageType.SSD;
case ARCHIVE:
return StorageType.ARCHIVE;
case RAM_DISK:
return StorageType.RAM_DISK;
default:

View File

@ -23,7 +23,6 @@ import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
@ -54,6 +53,7 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault
import org.apache.hadoop.hdfs.server.namenode.UnsupportedActionException;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Tool;
@ -270,7 +270,7 @@ public class Balancer {
// over-utilized, above-average, below-average and under-utilized.
long overLoadedBytes = 0L, underLoadedBytes = 0L;
for(DatanodeStorageReport r : reports) {
final DDatanode dn = dispatcher.newDatanode(r);
final DDatanode dn = dispatcher.newDatanode(r.getDatanodeInfo());
for(StorageType t : StorageType.asList()) {
final Double utilization = policy.getUtilization(r, t);
if (utilization == null) { // datanode does not have such storage type
@ -294,7 +294,7 @@ public class Balancer {
}
g = s;
} else {
g = dn.addStorageGroup(t, maxSize2Move);
g = dn.addTarget(t, maxSize2Move);
if (thresholdDiff <= 0) { // within threshold
belowAvgUtilized.add(g);
} else {
@ -548,15 +548,10 @@ public class Balancer {
final Formatter formatter = new Formatter(System.out);
System.out.println("Time Stamp Iteration# Bytes Already Moved Bytes Left To Move Bytes Being Moved");
final List<NameNodeConnector> connectors
= new ArrayList<NameNodeConnector>(namenodes.size());
List<NameNodeConnector> connectors = Collections.emptyList();
try {
for (URI uri : namenodes) {
final NameNodeConnector nnc = new NameNodeConnector(
Balancer.class.getSimpleName(), uri, BALANCER_ID_PATH, conf);
nnc.getKeyManager().startBlockKeyUpdater();
connectors.add(nnc);
}
connectors = NameNodeConnector.newNameNodeConnectors(namenodes,
Balancer.class.getSimpleName(), BALANCER_ID_PATH, conf);
boolean done = false;
for(int iteration = 0; !done; iteration++) {
@ -581,7 +576,7 @@ public class Balancer {
}
} finally {
for(NameNodeConnector nnc : connectors) {
nnc.close();
IOUtils.cleanup(LOG, nnc);
}
}
return ExitStatus.SUCCESS.getExitCode();

View File

@ -43,12 +43,15 @@ import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
@ -86,7 +89,11 @@ public class Dispatcher {
private static final long MAX_BLOCKS_SIZE_TO_FETCH = 2 * GB;
private static final int MAX_NO_PENDING_MOVE_ITERATIONS = 5;
private static final long DELAY_AFTER_ERROR = 10 * 1000L; // 10 seconds
/**
* the period of time to delay the usage of a DataNode after hitting
* errors when using it for migrating data
*/
private static long delayAfterErrors = 10 * 1000;
private final NameNodeConnector nnc;
private final SaslDataTransferClient saslClient;
@ -103,12 +110,14 @@ public class Dispatcher {
private final MovedBlocks<StorageGroup> movedBlocks;
/** Map (datanodeUuid,storageType -> StorageGroup) */
private final StorageGroupMap storageGroupMap = new StorageGroupMap();
private final StorageGroupMap<StorageGroup> storageGroupMap
= new StorageGroupMap<StorageGroup>();
private NetworkTopology cluster;
private final ExecutorService moveExecutor;
private final ExecutorService dispatchExecutor;
/** The maximum number of concurrent blocks moves at a datanode */
private final int maxConcurrentMovesPerNode;
@ -140,18 +149,18 @@ public class Dispatcher {
}
}
static class StorageGroupMap {
public static class StorageGroupMap<G extends StorageGroup> {
private static String toKey(String datanodeUuid, StorageType storageType) {
return datanodeUuid + ":" + storageType;
}
private final Map<String, StorageGroup> map = new HashMap<String, StorageGroup>();
private final Map<String, G> map = new HashMap<String, G>();
StorageGroup get(String datanodeUuid, StorageType storageType) {
public G get(String datanodeUuid, StorageType storageType) {
return map.get(toKey(datanodeUuid, storageType));
}
void put(StorageGroup g) {
public void put(G g) {
final String key = toKey(g.getDatanodeInfo().getDatanodeUuid(), g.storageType);
final StorageGroup existing = map.put(key, g);
Preconditions.checkState(existing == null);
@ -164,24 +173,32 @@ public class Dispatcher {
void clear() {
map.clear();
}
public Collection<G> values() {
return map.values();
}
}
/** This class keeps track of a scheduled block move */
private class PendingMove {
public class PendingMove {
private DBlock block;
private Source source;
private DDatanode proxySource;
private StorageGroup target;
private PendingMove() {
private PendingMove(Source source, StorageGroup target) {
this.source = source;
this.target = target;
}
@Override
public String toString() {
final Block b = block.getBlock();
return b + " with size=" + b.getNumBytes() + " from "
+ source.getDisplayName() + " to " + target.getDisplayName()
+ " through " + proxySource.datanode;
final Block b = block != null ? block.getBlock() : null;
String bStr = b != null ? (b + " with size=" + b.getNumBytes() + " ")
: " ";
return bStr + "from " + source.getDisplayName() + " to " + target
.getDisplayName() + " through " + (proxySource != null ? proxySource
.datanode : "");
}
/**
@ -191,9 +208,11 @@ public class Dispatcher {
* @return true if a block and its proxy are chosen; false otherwise
*/
private boolean chooseBlockAndProxy() {
// source and target must have the same storage type
final StorageType t = source.getStorageType();
// iterate all source's blocks until find a good one
for (Iterator<DBlock> i = source.getBlockIterator(); i.hasNext();) {
if (markMovedIfGoodBlock(i.next())) {
if (markMovedIfGoodBlock(i.next(), t)) {
i.remove();
return true;
}
@ -204,10 +223,10 @@ public class Dispatcher {
/**
* @return true if the given block is good for the tentative move.
*/
private boolean markMovedIfGoodBlock(DBlock block) {
private boolean markMovedIfGoodBlock(DBlock block, StorageType targetStorageType) {
synchronized (block) {
synchronized (movedBlocks) {
if (isGoodBlockCandidate(source, target, block)) {
if (isGoodBlockCandidate(source, target, targetStorageType, block)) {
this.block = block;
if (chooseProxySource()) {
movedBlocks.put(block);
@ -300,12 +319,13 @@ public class Dispatcher {
LOG.info("Successfully moved " + this);
} catch (IOException e) {
LOG.warn("Failed to move " + this + ": " + e.getMessage());
target.getDDatanode().setHasFailure();
// Proxy or target may have some issues, delay before using these nodes
// further in order to avoid a potential storm of "threads quota
// exceeded" warnings when the dispatcher gets out of sync with work
// going on in datanodes.
proxySource.activateDelay(DELAY_AFTER_ERROR);
target.getDDatanode().activateDelay(DELAY_AFTER_ERROR);
proxySource.activateDelay(delayAfterErrors);
target.getDDatanode().activateDelay(delayAfterErrors);
} finally {
IOUtils.closeStream(out);
IOUtils.closeStream(in);
@ -356,10 +376,23 @@ public class Dispatcher {
}
/** A class for keeping track of block locations in the dispatcher. */
private static class DBlock extends MovedBlocks.Locations<StorageGroup> {
DBlock(Block block) {
public static class DBlock extends MovedBlocks.Locations<StorageGroup> {
public DBlock(Block block) {
super(block);
}
@Override
public synchronized boolean isLocatedOn(StorageGroup loc) {
// currently we only check if replicas are located on the same DataNodes
// since we do not have the capability to store two replicas in the same
// DataNode even though they are on two different storage types
for (StorageGroup existing : locations) {
if (existing.getDatanodeInfo().equals(loc.getDatanodeInfo())) {
return true;
}
}
return false;
}
}
/** The class represents a desired move. */
@ -378,10 +411,10 @@ public class Dispatcher {
}
/** A class that keeps track of a datanode. */
static class DDatanode {
public static class DDatanode {
/** A group of storages in a datanode with the same storage type. */
class StorageGroup {
public class StorageGroup {
final StorageType storageType;
final long maxSize2Move;
private long scheduledSize = 0L;
@ -390,18 +423,26 @@ public class Dispatcher {
this.storageType = storageType;
this.maxSize2Move = maxSize2Move;
}
public StorageType getStorageType() {
return storageType;
}
private DDatanode getDDatanode() {
return DDatanode.this;
}
DatanodeInfo getDatanodeInfo() {
public DatanodeInfo getDatanodeInfo() {
return DDatanode.this.datanode;
}
/** Decide if still need to move more bytes */
synchronized boolean hasSpaceForScheduling() {
return availableSizeToMove() > 0L;
boolean hasSpaceForScheduling() {
return hasSpaceForScheduling(0L);
}
synchronized boolean hasSpaceForScheduling(long size) {
return availableSizeToMove() > size;
}
/** @return the total number of bytes that need to be moved */
@ -410,7 +451,7 @@ public class Dispatcher {
}
/** increment scheduled size */
synchronized void incScheduledSize(long size) {
public synchronized void incScheduledSize(long size) {
scheduledSize += size;
}
@ -424,6 +465,18 @@ public class Dispatcher {
scheduledSize = 0L;
}
private PendingMove addPendingMove(DBlock block, final PendingMove pm) {
if (getDDatanode().addPendingBlock(pm)) {
if (pm.markMovedIfGoodBlock(block, getStorageType())) {
incScheduledSize(pm.block.getNumBytes());
return pm;
} else {
getDDatanode().removePendingBlock(pm);
}
}
return null;
}
/** @return the name for display */
String getDisplayName() {
return datanode + ":" + storageType;
@ -436,38 +489,46 @@ public class Dispatcher {
}
final DatanodeInfo datanode;
final EnumMap<StorageType, StorageGroup> storageMap
private final EnumMap<StorageType, Source> sourceMap
= new EnumMap<StorageType, Source>(StorageType.class);
private final EnumMap<StorageType, StorageGroup> targetMap
= new EnumMap<StorageType, StorageGroup>(StorageType.class);
protected long delayUntil = 0L;
/** blocks being moved but not confirmed yet */
private final List<PendingMove> pendings;
private volatile boolean hasFailure = false;
private final int maxConcurrentMoves;
@Override
public String toString() {
return getClass().getSimpleName() + ":" + datanode + ":" + storageMap.values();
return getClass().getSimpleName() + ":" + datanode;
}
private DDatanode(DatanodeStorageReport r, int maxConcurrentMoves) {
this.datanode = r.getDatanodeInfo();
private DDatanode(DatanodeInfo datanode, int maxConcurrentMoves) {
this.datanode = datanode;
this.maxConcurrentMoves = maxConcurrentMoves;
this.pendings = new ArrayList<PendingMove>(maxConcurrentMoves);
}
private void put(StorageType storageType, StorageGroup g) {
final StorageGroup existing = storageMap.put(storageType, g);
public DatanodeInfo getDatanodeInfo() {
return datanode;
}
private static <G extends StorageGroup> void put(StorageType storageType,
G g, EnumMap<StorageType, G> map) {
final StorageGroup existing = map.put(storageType, g);
Preconditions.checkState(existing == null);
}
StorageGroup addStorageGroup(StorageType storageType, long maxSize2Move) {
public StorageGroup addTarget(StorageType storageType, long maxSize2Move) {
final StorageGroup g = new StorageGroup(storageType, maxSize2Move);
put(storageType, g);
put(storageType, g, targetMap);
return g;
}
Source addSource(StorageType storageType, long maxSize2Move, Dispatcher d) {
public Source addSource(StorageType storageType, long maxSize2Move, Dispatcher d) {
final Source s = d.new Source(storageType, maxSize2Move, this);
put(storageType, s);
put(storageType, s, sourceMap);
return s;
}
@ -505,10 +566,14 @@ public class Dispatcher {
synchronized boolean removePendingBlock(PendingMove pendingBlock) {
return pendings.remove(pendingBlock);
}
void setHasFailure() {
this.hasFailure = true;
}
}
/** A node that can be the sources of a block move */
class Source extends DDatanode.StorageGroup {
public class Source extends DDatanode.StorageGroup {
private final List<Task> tasks = new ArrayList<Task>(2);
private long blocksToReceive = 0L;
@ -576,8 +641,11 @@ public class Dispatcher {
/** Decide if the given block is a good candidate to move or not */
private boolean isGoodBlockCandidate(DBlock block) {
// source and target must have the same storage type
final StorageType sourceStorageType = getStorageType();
for (Task t : tasks) {
if (Dispatcher.this.isGoodBlockCandidate(this, t.target, block)) {
if (Dispatcher.this.isGoodBlockCandidate(this, t.target,
sourceStorageType, block)) {
return true;
}
}
@ -597,11 +665,9 @@ public class Dispatcher {
for (Iterator<Task> i = tasks.iterator(); i.hasNext();) {
final Task task = i.next();
final DDatanode target = task.target.getDDatanode();
PendingMove pendingBlock = new PendingMove();
final PendingMove pendingBlock = new PendingMove(this, task.target);
if (target.addPendingBlock(pendingBlock)) {
// target is not busy, so do a tentative block allocation
pendingBlock.source = this;
pendingBlock.target = task.target;
if (pendingBlock.chooseBlockAndProxy()) {
long blockSize = pendingBlock.block.getNumBytes();
incScheduledSize(-blockSize);
@ -618,6 +684,11 @@ public class Dispatcher {
}
return null;
}
/** Add a pending move */
public PendingMove addPendingMove(DBlock block, StorageGroup target) {
return target.addPendingMove(block, new PendingMove(this, target));
}
/** Iterate all source's blocks to remove moved ones */
private void removeMovedBlocks() {
@ -656,13 +727,7 @@ public class Dispatcher {
if (p != null) {
// Reset no pending move counter
noPendingMoveIteration=0;
// move the block
moveExecutor.execute(new Runnable() {
@Override
public void run() {
p.dispatch();
}
});
executePendingMove(p);
continue;
}
@ -718,7 +783,8 @@ public class Dispatcher {
this.cluster = NetworkTopology.getInstance(conf);
this.moveExecutor = Executors.newFixedThreadPool(moverThreads);
this.dispatchExecutor = Executors.newFixedThreadPool(dispatcherThreads);
this.dispatchExecutor = dispatcherThreads == 0? null
: Executors.newFixedThreadPool(dispatcherThreads);
this.maxConcurrentMovesPerNode = maxConcurrentMovesPerNode;
final boolean fallbackToSimpleAuthAllowed = conf.getBoolean(
@ -729,11 +795,15 @@ public class Dispatcher {
TrustedChannelResolver.getInstance(conf), fallbackToSimpleAuthAllowed);
}
StorageGroupMap getStorageGroupMap() {
public DistributedFileSystem getDistributedFileSystem() {
return nnc.getDistributedFileSystem();
}
public StorageGroupMap<StorageGroup> getStorageGroupMap() {
return storageGroupMap;
}
NetworkTopology getCluster() {
public NetworkTopology getCluster() {
return cluster;
}
@ -781,7 +851,7 @@ public class Dispatcher {
}
/** Get live datanode storage reports and then build the network topology. */
List<DatanodeStorageReport> init() throws IOException {
public List<DatanodeStorageReport> init() throws IOException {
final DatanodeStorageReport[] reports = nnc.getLiveDatanodeStorageReport();
final List<DatanodeStorageReport> trimmed = new ArrayList<DatanodeStorageReport>();
// create network topology and classify utilization collections:
@ -797,8 +867,18 @@ public class Dispatcher {
return trimmed;
}
public DDatanode newDatanode(DatanodeStorageReport r) {
return new DDatanode(r, maxConcurrentMovesPerNode);
public DDatanode newDatanode(DatanodeInfo datanode) {
return new DDatanode(datanode, maxConcurrentMovesPerNode);
}
public void executePendingMove(final PendingMove p) {
// move the block
moveExecutor.execute(new Runnable() {
@Override
public void run() {
p.dispatch();
}
});
}
public boolean dispatchAndCheckContinue() throws InterruptedException {
@ -838,7 +918,7 @@ public class Dispatcher {
}
// wait for all block moving to be done
waitForMoveCompletion();
waitForMoveCompletion(targets);
return bytesMoved.get() - bytesLastMoved;
}
@ -846,23 +926,25 @@ public class Dispatcher {
/** The sleeping period before checking if block move is completed again */
static private long blockMoveWaitTime = 30000L;
/** set the sleeping period for block move completion check */
static void setBlockMoveWaitTime(long time) {
blockMoveWaitTime = time;
}
/** Wait for all block move confirmations. */
private void waitForMoveCompletion() {
/**
* Wait for all block move confirmations.
* @return true if any move execution failed
*/
public static boolean waitForMoveCompletion(
Iterable<? extends StorageGroup> targets) {
boolean hasFailure = false;
for(;;) {
boolean empty = true;
for (StorageGroup t : targets) {
if (!t.getDDatanode().isPendingQEmpty()) {
empty = false;
break;
} else {
hasFailure |= t.getDDatanode().hasFailure;
}
}
if (empty) {
return; //all pending queues are empty
return hasFailure; // all pending queues are empty
}
try {
Thread.sleep(blockMoveWaitTime);
@ -873,14 +955,14 @@ public class Dispatcher {
/**
* Decide if the block is a good candidate to be moved from source to target.
* A block is a good candidate if
* A block is a good candidate if
* 1. the block is not in the process of being moved/has not been moved;
* 2. the block does not have a replica on the target;
* 3. doing the move does not reduce the number of racks that the block has
*/
private boolean isGoodBlockCandidate(Source source, StorageGroup target,
DBlock block) {
if (source.storageType != target.storageType) {
private boolean isGoodBlockCandidate(StorageGroup source, StorageGroup target,
StorageType targetStorageType, DBlock block) {
if (target.storageType != targetStorageType) {
return false;
}
// check if the block is moved or not
@ -891,7 +973,7 @@ public class Dispatcher {
return false;
}
if (cluster.isNodeGroupAware()
&& isOnSameNodeGroupWithReplicas(target, block, source)) {
&& isOnSameNodeGroupWithReplicas(source, target, block)) {
return false;
}
if (reduceNumOfRacks(source, target, block)) {
@ -904,7 +986,7 @@ public class Dispatcher {
* Determine whether moving the given block replica from source to target
* would reduce the number of racks of the block replicas.
*/
private boolean reduceNumOfRacks(Source source, StorageGroup target,
private boolean reduceNumOfRacks(StorageGroup source, StorageGroup target,
DBlock block) {
final DatanodeInfo sourceDn = source.getDatanodeInfo();
if (cluster.isOnSameRack(sourceDn, target.getDatanodeInfo())) {
@ -937,12 +1019,12 @@ public class Dispatcher {
* Check if there are any replica (other than source) on the same node group
* with target. If true, then target is not a good candidate for placing
* specific replica as we don't want 2 replicas under the same nodegroup.
*
*
* @return true if there are any replica (other than source) on the same node
* group with target
*/
private boolean isOnSameNodeGroupWithReplicas(
StorageGroup target, DBlock block, Source source) {
private boolean isOnSameNodeGroupWithReplicas(StorageGroup source,
StorageGroup target, DBlock block) {
final DatanodeInfo targetDn = target.getDatanodeInfo();
for (StorageGroup g : block.getLocations()) {
if (g != source && cluster.isOnSameNodeGroup(g.getDatanodeInfo(), targetDn)) {
@ -962,9 +1044,22 @@ public class Dispatcher {
movedBlocks.cleanup();
}
/** set the sleeping period for block move completion check */
@VisibleForTesting
public static void setBlockMoveWaitTime(long time) {
blockMoveWaitTime = time;
}
@VisibleForTesting
public static void setDelayAfterErrors(long time) {
delayAfterErrors = time;
}
/** shutdown thread pools */
void shutdownNow() {
dispatchExecutor.shutdownNow();
public void shutdownNow() {
if (dispatchExecutor != null) {
dispatchExecutor.shutdownNow();
}
moveExecutor.shutdownNow();
}
View File
@ -31,6 +31,11 @@ public interface Matcher {
public boolean match(NetworkTopology cluster, Node left, Node right) {
return cluster.isOnSameNodeGroup(left, right);
}
@Override
public String toString() {
return "SAME_NODE_GROUP";
}
};
/** Match datanodes in the same rack. */
@ -39,6 +44,11 @@ public interface Matcher {
public boolean match(NetworkTopology cluster, Node left, Node right) {
return cluster.isOnSameRack(left, right);
}
@Override
public String toString() {
return "SAME_RACK";
}
};
/** Match any datanode with any other datanode. */
@ -47,5 +57,10 @@ public interface Matcher {
public boolean match(NetworkTopology cluster, Node left, Node right) {
return left != right;
}
@Override
public String toString() {
return "ANY_OTHER";
}
};
}
View File
@ -40,7 +40,7 @@ public class MovedBlocks<L> {
public static class Locations<L> {
private final Block block; // the block
/** The locations of the replicas of the block. */
private final List<L> locations = new ArrayList<L>(3);
protected final List<L> locations = new ArrayList<L>(3);
public Locations(Block block) {
this.block = block;
View File
@ -18,19 +18,25 @@
package org.apache.hadoop.hdfs.server.balancer;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsServerDefaults;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
@ -43,6 +49,8 @@ import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import com.google.common.annotations.VisibleForTesting;
/**
* The class provides utilities for accessing a NameNode.
*/
@ -51,6 +59,41 @@ public class NameNodeConnector implements Closeable {
private static final Log LOG = LogFactory.getLog(NameNodeConnector.class);
private static final int MAX_NOT_CHANGED_ITERATIONS = 5;
private static boolean write2IdFile = true;
/** Create {@link NameNodeConnector} for the given namenodes. */
public static List<NameNodeConnector> newNameNodeConnectors(
Collection<URI> namenodes, String name, Path idPath, Configuration conf)
throws IOException {
final List<NameNodeConnector> connectors = new ArrayList<NameNodeConnector>(
namenodes.size());
for (URI uri : namenodes) {
NameNodeConnector nnc = new NameNodeConnector(name, uri, idPath,
null, conf);
nnc.getKeyManager().startBlockKeyUpdater();
connectors.add(nnc);
}
return connectors;
}
public static List<NameNodeConnector> newNameNodeConnectors(
Map<URI, List<Path>> namenodes, String name, Path idPath,
Configuration conf) throws IOException {
final List<NameNodeConnector> connectors = new ArrayList<NameNodeConnector>(
namenodes.size());
for (Map.Entry<URI, List<Path>> entry : namenodes.entrySet()) {
NameNodeConnector nnc = new NameNodeConnector(name, entry.getKey(),
idPath, entry.getValue(), conf);
nnc.getKeyManager().startBlockKeyUpdater();
connectors.add(nnc);
}
return connectors;
}
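A hedged usage sketch (illustration only, not part of this patch) of the collection-based factory above; the tool name, the id-file path "/system/mytool.id", and the DFSUtil.getNsServiceRpcUris call are assumptions modeled on how the Balancer wires this up elsewhere in the diff.
import java.net.URI;
import java.util.Collection;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.server.balancer.NameNodeConnector;
import org.apache.hadoop.io.IOUtils;
public class ConnectorUsageSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new HdfsConfiguration();
    // Collect the service RPC URI of every configured namenode (federation-aware).
    Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
    // One connector per namenode; the id-file path here is a made-up example.
    List<NameNodeConnector> connectors = NameNodeConnector.newNameNodeConnectors(
        namenodes, "MyTool", new Path("/system/mytool.id"), conf);
    try {
      // ... per-namenode iterations would go here ...
    } finally {
      for (NameNodeConnector nnc : connectors) {
        IOUtils.cleanup(null, nnc);   // same close idiom the Balancer uses above
      }
    }
  }
}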
@VisibleForTesting
public static void setWrite2IdFile(boolean write2IdFile) {
NameNodeConnector.write2IdFile = write2IdFile;
}
private final URI nameNodeUri;
private final String blockpoolID;
@ -59,22 +102,26 @@ public class NameNodeConnector implements Closeable {
private final ClientProtocol client;
private final KeyManager keyManager;
private final FileSystem fs;
private final DistributedFileSystem fs;
private final Path idPath;
private final OutputStream out;
private final List<Path> targetPaths;
private int notChangedIterations = 0;
public NameNodeConnector(String name, URI nameNodeUri, Path idPath,
Configuration conf) throws IOException {
List<Path> targetPaths, Configuration conf)
throws IOException {
this.nameNodeUri = nameNodeUri;
this.idPath = idPath;
this.targetPaths = targetPaths == null || targetPaths.isEmpty() ? Arrays
.asList(new Path("/")) : targetPaths;
this.namenode = NameNodeProxies.createProxy(conf, nameNodeUri,
NamenodeProtocol.class).getProxy();
this.client = NameNodeProxies.createProxy(conf, nameNodeUri,
ClientProtocol.class).getProxy();
this.fs = FileSystem.get(nameNodeUri, conf);
this.fs = (DistributedFileSystem)FileSystem.get(nameNodeUri, conf);
final NamespaceInfo namespaceinfo = namenode.versionRequest();
this.blockpoolID = namespaceinfo.getBlockPoolID();
@ -82,13 +129,18 @@ public class NameNodeConnector implements Closeable {
final FsServerDefaults defaults = fs.getServerDefaults(new Path("/"));
this.keyManager = new KeyManager(blockpoolID, namenode,
defaults.getEncryptDataTransfer(), conf);
// Exit if there is another one running.
out = checkAndMarkRunning();
// if it is for test, we do not create the id file
out = checkAndMarkRunning();
if (out == null) {
// Exit if there is another one running.
throw new IOException("Another " + name + " is running.");
}
}
public DistributedFileSystem getDistributedFileSystem() {
return fs;
}
/** @return the block pool ID */
public String getBlockpoolID() {
return blockpoolID;
@ -111,6 +163,11 @@ public class NameNodeConnector implements Closeable {
return keyManager;
}
/** @return the list of paths to scan/migrate */
public List<Path> getTargetPaths() {
return targetPaths;
}
/** Should the instance continue running? */
public boolean shouldContinue(long dispatchBlockMoveBytes) {
if (dispatchBlockMoveBytes > 0) {
@ -144,9 +201,11 @@ public class NameNodeConnector implements Closeable {
*/
private OutputStream checkAndMarkRunning() throws IOException {
try {
final DataOutputStream out = fs.create(idPath);
out.writeBytes(InetAddress.getLocalHost().getHostName());
out.flush();
final FSDataOutputStream out = fs.create(idPath);
if (write2IdFile) {
out.writeBytes(InetAddress.getLocalHost().getHostName());
out.hflush();
}
return out;
} catch(RemoteException e) {
if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
View File
@ -66,6 +66,11 @@ public interface BlockCollection {
*/
public short getBlockReplication();
/**
* @return the storage policy ID.
*/
public byte getStoragePolicyID();
/**
* Get the name of the collection.
*/
View File
@ -42,6 +42,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
@ -254,6 +255,7 @@ public class BlockManager {
/** for block replicas placement */
private BlockPlacementPolicy blockplacement;
private final BlockStoragePolicy.Suite storagePolicySuite;
/** Check whether name system is running before terminating */
private boolean checkNSRunning = true;
@ -276,6 +278,7 @@ public class BlockManager {
blockplacement = BlockPlacementPolicy.getInstance(
conf, stats, datanodeManager.getNetworkTopology(),
datanodeManager.getHost2DatanodeMap());
storagePolicySuite = BlockStoragePolicy.readBlockStorageSuite(conf);
pendingReplications = new PendingReplicationBlocks(conf.getInt(
DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_DEFAULT) * 1000L);
@ -394,7 +397,11 @@ public class BlockManager {
lifetimeMin*60*1000L, 0, null, encryptionAlgorithm);
}
}
public BlockStoragePolicy getStoragePolicy(final String policyName) {
return storagePolicySuite.getPolicy(policyName);
}
public void setBlockPoolId(String blockPoolId) {
if (isBlockTokenEnabled()) {
blockTokenSecretManager.setBlockPoolId(blockPoolId);
@ -445,7 +452,7 @@ public class BlockManager {
return datanodeManager;
}
/** @return the BlockPlacementPolicy */
@VisibleForTesting
public BlockPlacementPolicy getBlockPlacementPolicy() {
return blockplacement;
}
@ -1366,7 +1373,7 @@ public class BlockManager {
// choose replication targets: NOT HOLDING THE GLOBAL LOCK
// It is costly to extract the filename for which chooseTargets is called,
// so for now we pass in the block collection itself.
rw.chooseTargets(blockplacement, excludedNodes);
rw.chooseTargets(blockplacement, storagePolicySuite, excludedNodes);
}
namesystem.writeLock();
@ -1467,24 +1474,48 @@ public class BlockManager {
return scheduledWork;
}
/** Choose target for WebHDFS redirection. */
public DatanodeStorageInfo[] chooseTarget4WebHDFS(String src,
DatanodeDescriptor clientnode, Set<Node> excludes, long blocksize) {
return blockplacement.chooseTarget(src, 1, clientnode,
Collections.<DatanodeStorageInfo>emptyList(), false, excludes,
blocksize, storagePolicySuite.getDefaultPolicy());
}
/** Choose target for getting additional datanodes for an existing pipeline. */
public DatanodeStorageInfo[] chooseTarget4AdditionalDatanode(String src,
int numAdditionalNodes,
DatanodeDescriptor clientnode,
List<DatanodeStorageInfo> chosen,
Set<Node> excludes,
long blocksize,
byte storagePolicyID) {
final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(storagePolicyID);
return blockplacement.chooseTarget(src, numAdditionalNodes, clientnode,
chosen, true, excludes, blocksize, storagePolicy);
}
/**
* Choose target datanodes according to the replication policy.
* Choose target datanodes for creating a new block.
*
* @throws IOException
* if the number of targets < minimum replication.
* @see BlockPlacementPolicy#chooseTarget(String, int, Node,
* List, boolean, Set, long, StorageType)
* Set, long, List, BlockStoragePolicy)
*/
public DatanodeStorageInfo[] chooseTarget(final String src,
public DatanodeStorageInfo[] chooseTarget4NewBlock(final String src,
final int numOfReplicas, final DatanodeDescriptor client,
final Set<Node> excludedNodes,
final long blocksize, List<String> favoredNodes) throws IOException {
final long blocksize,
final List<String> favoredNodes,
final byte storagePolicyID) throws IOException {
List<DatanodeDescriptor> favoredDatanodeDescriptors =
getDatanodeDescriptors(favoredNodes);
final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(storagePolicyID);
final DatanodeStorageInfo[] targets = blockplacement.chooseTarget(src,
numOfReplicas, client, excludedNodes, blocksize,
// TODO: get storage type from file
favoredDatanodeDescriptors, StorageType.DEFAULT);
favoredDatanodeDescriptors, storagePolicy);
if (targets.length < minReplication) {
throw new IOException("File " + src + " could only be replicated to "
+ targets.length + " nodes instead of minReplication (="
@ -2716,6 +2747,10 @@ public class BlockManager {
assert namesystem.hasWriteLock();
// first form a rack to datanodes map and
BlockCollection bc = getBlockCollection(b);
final BlockStoragePolicy storagePolicy = storagePolicySuite.getPolicy(bc.getStoragePolicyID());
final List<StorageType> excessTypes = storagePolicy.chooseExcess(
replication, DatanodeStorageInfo.toStorageTypes(nonExcess));
final Map<String, List<DatanodeStorageInfo>> rackMap
= new HashMap<String, List<DatanodeStorageInfo>>();
@ -2736,16 +2771,13 @@ public class BlockManager {
final DatanodeStorageInfo addedNodeStorage
= DatanodeStorageInfo.getDatanodeStorageInfo(nonExcess, addedNode);
while (nonExcess.size() - replication > 0) {
// check if we can delete delNodeHint
final DatanodeStorageInfo cur;
if (firstOne && delNodeHintStorage != null
&& (moreThanOne.contains(delNodeHintStorage)
|| (addedNodeStorage != null
&& !moreThanOne.contains(addedNodeStorage)))) {
if (useDelHint(firstOne, delNodeHintStorage, addedNodeStorage,
moreThanOne, excessTypes)) {
cur = delNodeHintStorage;
} else { // regular excessive replica removal
cur = replicator.chooseReplicaToDelete(bc, b, replication,
moreThanOne, exactlyOne);
moreThanOne, exactlyOne, excessTypes);
}
firstOne = false;
@ -2771,6 +2803,27 @@ public class BlockManager {
}
}
/** Check if we can use delHint */
static boolean useDelHint(boolean isFirst, DatanodeStorageInfo delHint,
DatanodeStorageInfo added, List<DatanodeStorageInfo> moreThan1Racks,
List<StorageType> excessTypes) {
if (!isFirst) {
return false; // only consider delHint for the first case
} else if (delHint == null) {
return false; // no delHint
} else if (!excessTypes.contains(delHint.getStorageType())) {
return false; // delHint storage type is not an excess type
} else {
// check if removing delHint reduces the number of racks
if (moreThan1Racks.contains(delHint)) {
return true; // delHint and some other nodes are under the same rack
} else if (added != null && !moreThan1Racks.contains(added)) {
return true; // the added node adds a new rack
}
return false; // removing delHint would reduce the number of racks
}
}
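To make the rules above concrete, here is a small stand-alone sketch (not part of the patch) that restates the useDelHint decision with simplified stand-in types (a String per storage, a local enum for StorageType) so it can be exercised in isolation.
import java.util.Arrays;
import java.util.List;
// Sketch only: same decision order as BlockManager.useDelHint above, with stand-in types.
public class UseDelHintSketch {
  enum StorageType { DISK, ARCHIVE }
  static boolean useDelHint(boolean isFirst, String delHint, StorageType delHintType,
      String added, List<String> moreThan1Racks, List<StorageType> excessTypes) {
    if (!isFirst) {
      return false;                                  // only honored for the first removal
    } else if (delHint == null) {
      return false;                                  // no hint given
    } else if (!excessTypes.contains(delHintType)) {
      return false;                                  // hint's storage type is not excess
    } else if (moreThan1Racks.contains(delHint)) {
      return true;                                   // hint shares a rack with other replicas
    } else {
      return added != null && !moreThan1Racks.contains(added);  // added node brings a new rack
    }
  }
  public static void main(String[] args) {
    // Hint sits on a rack that still holds other replicas and its type is excess: usable.
    System.out.println(useDelHint(true, "s1", StorageType.ARCHIVE, null,
        Arrays.asList("s1", "s2"), Arrays.asList(StorageType.ARCHIVE)));   // true
    // Hint's storage type is not an excess type: ignored.
    System.out.println(useDelHint(true, "s1", StorageType.DISK, null,
        Arrays.asList("s1", "s2"), Arrays.asList(StorageType.ARCHIVE)));   // false
  }
}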
private void addToExcessReplicate(DatanodeInfo dn, Block block) {
assert namesystem.hasWriteLock();
LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(dn.getDatanodeUuid());
@ -2877,7 +2930,7 @@ public class BlockManager {
// Decrement number of blocks scheduled to this datanode.
// for a retry request (of DatanodeProtocol#blockReceivedAndDeleted with
// RECEIVED_BLOCK), we currently also decrease the approximate number.
node.decrementBlocksScheduled();
node.decrementBlocksScheduled(storageInfo.getStorageType());
// get the deletion hint node
DatanodeDescriptor delHintNode = null;
@ -3546,10 +3599,12 @@ public class BlockManager {
}
private void chooseTargets(BlockPlacementPolicy blockplacement,
BlockStoragePolicy.Suite storagePolicySuite,
Set<Node> excludedNodes) {
targets = blockplacement.chooseTarget(bc.getName(),
additionalReplRequired, srcNode, liveReplicaStorages, false,
excludedNodes, block.getNumBytes(), StorageType.DEFAULT);
excludedNodes, block.getNumBytes(),
storagePolicySuite.getPolicy(bc.getStoragePolicyID()));
}
}
View File
@ -27,6 +27,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block;
@ -75,7 +76,7 @@ public abstract class BlockPlacementPolicy {
boolean returnChosenNodes,
Set<Node> excludedNodes,
long blocksize,
StorageType storageType);
BlockStoragePolicy storagePolicy);
/**
* Same as {@link #chooseTarget(String, int, Node, Set, long, List, StorageType)}
@ -89,14 +90,14 @@ public abstract class BlockPlacementPolicy {
Set<Node> excludedNodes,
long blocksize,
List<DatanodeDescriptor> favoredNodes,
StorageType storageType) {
BlockStoragePolicy storagePolicy) {
// This class does not provide the functionality of placing
// a block in favored datanodes. The implementations of this class
// are expected to provide this functionality
return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeStorageInfo>(numOfReplicas), false,
excludedNodes, blocksize, storageType);
excludedNodes, blocksize, storagePolicy);
}
/**
@ -118,18 +119,21 @@ public abstract class BlockPlacementPolicy {
* @param srcBC block collection of file to which block-to-be-deleted belongs
* @param block The block to be deleted
* @param replicationFactor The required number of replicas for this block
* @param existingReplicas The replica locations of this block that are present
on at least two unique racks.
* @param moreExistingReplicas Replica locations of this block that are not
listed in the previous parameter.
* @param moreThanOne The replica locations of this block that are present
* on more than one unique rack.
* @param exactlyOne Replica locations of this block that are present
* on exactly one unique rack.
* @param excessTypes The excess {@link StorageType}s according to the
* {@link BlockStoragePolicy}.
* @return the replica that is the best candidate for deletion
*/
abstract public DatanodeStorageInfo chooseReplicaToDelete(
BlockCollection srcBC,
Block block,
short replicationFactor,
Collection<DatanodeStorageInfo> existingReplicas,
Collection<DatanodeStorageInfo> moreExistingReplicas);
Collection<DatanodeStorageInfo> moreThanOne,
Collection<DatanodeStorageInfo> exactlyOne,
List<StorageType> excessTypes);
/**
* Used to setup a BlockPlacementPolicy object. This should be defined by
View File
@ -19,15 +19,11 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
import static org.apache.hadoop.util.Time.now;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.*;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.StorageType;
@ -80,12 +76,6 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
*/
protected int tolerateHeartbeatMultiplier;
protected BlockPlacementPolicyDefault(Configuration conf, FSClusterStats stats,
NetworkTopology clusterMap,
Host2NodesMap host2datanodeMap) {
initialize(conf, stats, clusterMap, host2datanodeMap);
}
protected BlockPlacementPolicyDefault() {
}
@ -117,9 +107,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
boolean returnChosenNodes,
Set<Node> excludedNodes,
long blocksize,
StorageType storageType) {
final BlockStoragePolicy storagePolicy) {
return chooseTarget(numOfReplicas, writer, chosenNodes, returnChosenNodes,
excludedNodes, blocksize, storageType);
excludedNodes, blocksize, storagePolicy);
}
@Override
@ -129,17 +119,21 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
Set<Node> excludedNodes,
long blocksize,
List<DatanodeDescriptor> favoredNodes,
StorageType storageType) {
BlockStoragePolicy storagePolicy) {
try {
if (favoredNodes == null || favoredNodes.size() == 0) {
// Favored nodes not specified, fall back to regular block placement.
return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeStorageInfo>(numOfReplicas), false,
excludedNodes, blocksize, storageType);
excludedNodes, blocksize, storagePolicy);
}
Set<Node> favoriteAndExcludedNodes = excludedNodes == null ?
new HashSet<Node>() : new HashSet<Node>(excludedNodes);
final List<StorageType> requiredStorageTypes = storagePolicy
.chooseStorageTypes((short)numOfReplicas);
final EnumMap<StorageType, Integer> storageTypes =
getRequiredStorageTypes(requiredStorageTypes);
// Choose favored nodes
List<DatanodeStorageInfo> results = new ArrayList<DatanodeStorageInfo>();
@ -152,7 +146,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
final DatanodeStorageInfo target = chooseLocalStorage(favoredNode,
favoriteAndExcludedNodes, blocksize,
getMaxNodesPerRack(results.size(), numOfReplicas)[1],
results, avoidStaleNodes, storageType, false);
results, avoidStaleNodes, storageTypes, false);
if (target == null) {
LOG.warn("Could not find a target for file " + src
+ " with favored node " + favoredNode);
@ -166,7 +160,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
numOfReplicas -= results.size();
DatanodeStorageInfo[] remainingTargets =
chooseTarget(src, numOfReplicas, writer, results,
false, favoriteAndExcludedNodes, blocksize, storageType);
false, favoriteAndExcludedNodes, blocksize, storagePolicy);
for (int i = 0; i < remainingTargets.length; i++) {
results.add(remainingTargets[i]);
}
@ -174,10 +168,14 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
return getPipeline(writer,
results.toArray(new DatanodeStorageInfo[results.size()]));
} catch (NotEnoughReplicasException nr) {
if (LOG.isDebugEnabled()) {
LOG.debug("Failed to choose with favored nodes (=" + favoredNodes
+ "), disregard favored nodes hint and retry.", nr);
}
// Fall back to regular block placement disregarding favored nodes hint
return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeStorageInfo>(numOfReplicas), false,
excludedNodes, blocksize, storageType);
excludedNodes, blocksize, storagePolicy);
}
}
@ -188,7 +186,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
boolean returnChosenNodes,
Set<Node> excludedNodes,
long blocksize,
StorageType storageType) {
final BlockStoragePolicy storagePolicy) {
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
return DatanodeStorageInfo.EMPTY_ARRAY;
}
@ -213,8 +211,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
boolean avoidStaleNodes = (stats != null
&& stats.isAvoidingStaleDataNodesForWrite());
Node localNode = chooseTarget(numOfReplicas, writer,
excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
final Node localNode = chooseTarget(numOfReplicas, writer, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storagePolicy,
EnumSet.noneOf(StorageType.class), results.isEmpty());
if (!returnChosenNodes) {
results.removeAll(chosenStorage);
}
@ -234,7 +233,22 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
int maxNodesPerRack = (totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;
return new int[] {numOfReplicas, maxNodesPerRack};
}
private EnumMap<StorageType, Integer> getRequiredStorageTypes(
List<StorageType> types) {
EnumMap<StorageType, Integer> map = new EnumMap<StorageType,
Integer>(StorageType.class);
for (StorageType type : types) {
if (!map.containsKey(type)) {
map.put(type, 1);
} else {
int num = map.get(type);
map.put(type, num + 1);
}
}
return map;
}
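A minimal sketch (illustration only, not in the patch) of the counting performed by getRequiredStorageTypes above; the enum is a stand-in and the input list is just an example of what a storage policy's chooseStorageTypes might hand back.
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
// Sketch only: same counting idea as getRequiredStorageTypes, with a stand-in enum.
public class RequiredStorageTypesSketch {
  enum StorageType { DISK, SSD, ARCHIVE }
  static EnumMap<StorageType, Integer> count(List<StorageType> types) {
    EnumMap<StorageType, Integer> map =
        new EnumMap<StorageType, Integer>(StorageType.class);
    for (StorageType t : types) {
      Integer num = map.get(t);
      map.put(t, num == null ? 1 : num + 1);   // same effect as the containsKey branches
    }
    return map;
  }
  public static void main(String[] args) {
    // e.g. a policy asking for one DISK replica and two ARCHIVE replicas
    System.out.println(count(Arrays.asList(
        StorageType.DISK, StorageType.ARCHIVE, StorageType.ARCHIVE)));
    // prints {DISK=1, ARCHIVE=2}  (EnumMap iterates in enum declaration order)
  }
}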
/**
* choose <i>numOfReplicas</i> from all data nodes
* @param numOfReplicas additional number of replicas wanted
@ -247,31 +261,49 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* @return local node of writer (not chosen node)
*/
private Node chooseTarget(int numOfReplicas,
Node writer,
Set<Node> excludedNodes,
long blocksize,
int maxNodesPerRack,
List<DatanodeStorageInfo> results,
final boolean avoidStaleNodes,
StorageType storageType) {
Node writer,
final Set<Node> excludedNodes,
final long blocksize,
final int maxNodesPerRack,
final List<DatanodeStorageInfo> results,
final boolean avoidStaleNodes,
final BlockStoragePolicy storagePolicy,
final EnumSet<StorageType> unavailableStorages,
final boolean newBlock) {
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
return writer;
}
int totalReplicasExpected = numOfReplicas + results.size();
int numOfResults = results.size();
boolean newBlock = (numOfResults==0);
final int numOfResults = results.size();
final int totalReplicasExpected = numOfReplicas + numOfResults;
if ((writer == null || !(writer instanceof DatanodeDescriptor)) && !newBlock) {
writer = results.get(0).getDatanodeDescriptor();
}
// Keep a copy of original excludedNodes
final Set<Node> oldExcludedNodes = avoidStaleNodes ?
new HashSet<Node>(excludedNodes) : null;
final Set<Node> oldExcludedNodes = new HashSet<Node>(excludedNodes);
// choose storage types; use fallbacks for unavailable storages
final List<StorageType> requiredStorageTypes = storagePolicy
.chooseStorageTypes((short) totalReplicasExpected,
DatanodeStorageInfo.toStorageTypes(results),
unavailableStorages, newBlock);
final EnumMap<StorageType, Integer> storageTypes =
getRequiredStorageTypes(requiredStorageTypes);
if (LOG.isTraceEnabled()) {
LOG.trace("storageTypes=" + storageTypes);
}
try {
if ((numOfReplicas = requiredStorageTypes.size()) == 0) {
throw new NotEnoughReplicasException(
"All required storage types are unavailable: "
+ " unavailableStorages=" + unavailableStorages
+ ", storagePolicy=" + storagePolicy);
}
if (numOfResults == 0) {
writer = chooseLocalStorage(writer, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType, true)
maxNodesPerRack, results, avoidStaleNodes, storageTypes, true)
.getDatanodeDescriptor();
if (--numOfReplicas == 0) {
return writer;
@ -280,7 +312,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
final DatanodeDescriptor dn0 = results.get(0).getDatanodeDescriptor();
if (numOfResults <= 1) {
chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageType);
results, avoidStaleNodes, storageTypes);
if (--numOfReplicas == 0) {
return writer;
}
@ -289,24 +321,28 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
final DatanodeDescriptor dn1 = results.get(1).getDatanodeDescriptor();
if (clusterMap.isOnSameRack(dn0, dn1)) {
chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageType);
results, avoidStaleNodes, storageTypes);
} else if (newBlock){
chooseLocalRack(dn1, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageType);
results, avoidStaleNodes, storageTypes);
} else {
chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageType);
results, avoidStaleNodes, storageTypes);
}
if (--numOfReplicas == 0) {
return writer;
}
}
chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
} catch (NotEnoughReplicasException e) {
final String message = "Failed to place enough replicas, still in need of "
+ (totalReplicasExpected - results.size()) + " to reach "
+ totalReplicasExpected + ".";
+ totalReplicasExpected
+ " (unavailableStorages=" + unavailableStorages
+ ", storagePolicy=" + storagePolicy
+ ", newBlock=" + newBlock + ")";
if (LOG.isTraceEnabled()) {
LOG.trace(message, e);
} else {
@ -327,7 +363,28 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
// if the NotEnoughReplicasException was thrown in chooseRandom().
numOfReplicas = totalReplicasExpected - results.size();
return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize,
maxNodesPerRack, results, false, storageType);
maxNodesPerRack, results, false, storagePolicy, unavailableStorages,
newBlock);
}
boolean retry = false;
// simply add all the remaining types into unavailableStorages and give
// another try. No best effort is guaranteed here.
for (StorageType type : storageTypes.keySet()) {
if (!unavailableStorages.contains(type)) {
unavailableStorages.add(type);
retry = true;
}
}
if (retry) {
for (DatanodeStorageInfo resultStorage : results) {
addToExcludedNodes(resultStorage.getDatanodeDescriptor(),
oldExcludedNodes);
}
numOfReplicas = totalReplicasExpected - results.size();
return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize,
maxNodesPerRack, results, false, storagePolicy, unavailableStorages,
newBlock);
}
}
return writer;
@ -340,28 +397,35 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* @return the chosen storage
*/
protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
Set<Node> excludedNodes,
long blocksize,
int maxNodesPerRack,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
StorageType storageType,
boolean fallbackToLocalRack)
Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack)
throws NotEnoughReplicasException {
// if no local machine, randomly choose one node
if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
if (preferLocalNode && localMachine instanceof DatanodeDescriptor) {
DatanodeDescriptor localDatanode = (DatanodeDescriptor) localMachine;
// otherwise try local machine first
if (excludedNodes.add(localMachine)) { // was not in the excluded list
for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
localDatanode.getStorageInfos())) {
if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
return localStorage;
for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
.entrySet().iterator(); iter.hasNext(); ) {
Map.Entry<StorageType, Integer> entry = iter.next();
for (DatanodeStorageInfo localStorage : DFSUtil.shuffle(
localDatanode.getStorageInfos())) {
StorageType type = entry.getKey();
if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) {
int num = entry.getValue();
if (num == 1) {
iter.remove();
} else {
entry.setValue(num - 1);
}
return localStorage;
}
}
}
}
@ -372,7 +436,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
}
// try a node on local rack
return chooseLocalRack(localMachine, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
/**
@ -395,50 +459,71 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* @return the chosen node
*/
protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
Set<Node> excludedNodes,
long blocksize,
int maxNodesPerRack,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
StorageType storageType)
Set<Node> excludedNodes,
long blocksize,
int maxNodesPerRack,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
EnumMap<StorageType, Integer> storageTypes)
throws NotEnoughReplicasException {
// no local machine, so choose a random machine
if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
final String localRack = localMachine.getNetworkLocation();
// choose one from the local rack
try {
return chooseRandom(localMachine.getNetworkLocation(), excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
} catch (NotEnoughReplicasException e1) {
// find the second replica
DatanodeDescriptor newLocal=null;
// choose one from the local rack
return chooseRandom(localRack, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
} catch (NotEnoughReplicasException e) {
// find the next replica and retry with its rack
for(DatanodeStorageInfo resultStorage : results) {
DatanodeDescriptor nextNode = resultStorage.getDatanodeDescriptor();
if (nextNode != localMachine) {
newLocal = nextNode;
break;
if (LOG.isDebugEnabled()) {
LOG.debug("Failed to choose from local rack (location = " + localRack
+ "), retry with the rack of the next replica (location = "
+ nextNode.getNetworkLocation() + ")", e);
}
return chooseFromNextRack(nextNode, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
}
if (newLocal != null) {
try {
return chooseRandom(newLocal.getNetworkLocation(), excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
} catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
}
} else {
//otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
if (LOG.isDebugEnabled()) {
LOG.debug("Failed to choose from local rack (location = " + localRack
+ "); the second replica is not found, retry choosing ramdomly", e);
}
//the second replica is not found, randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
}
private DatanodeStorageInfo chooseFromNextRack(Node next,
Set<Node> excludedNodes,
long blocksize,
int maxNodesPerRack,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
EnumMap<StorageType, Integer> storageTypes) throws NotEnoughReplicasException {
final String nextRack = next.getNetworkLocation();
try {
return chooseRandom(nextRack, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageTypes);
} catch(NotEnoughReplicasException e) {
if (LOG.isDebugEnabled()) {
LOG.debug("Failed to choose from the next rack (location = " + nextRack
+ "), retry choosing ramdomly", e);
}
//otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
}
/**
* Choose <i>numOfReplicas</i> nodes from the racks
* that <i>localMachine</i> is NOT on.
@ -453,18 +538,22 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
int maxReplicasPerRack,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
StorageType storageType)
EnumMap<StorageType, Integer> storageTypes)
throws NotEnoughReplicasException {
int oldNumOfReplicas = results.size();
// randomly choose one node from remote racks
try {
chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(),
excludedNodes, blocksize, maxReplicasPerRack, results,
avoidStaleNodes, storageType);
avoidStaleNodes, storageTypes);
} catch (NotEnoughReplicasException e) {
if (LOG.isDebugEnabled()) {
LOG.debug("Failed to choose remote rack (location = ~"
+ localMachine.getNetworkLocation() + "), fallback to local rack", e);
}
chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas),
localMachine.getNetworkLocation(), excludedNodes, blocksize,
maxReplicasPerRack, results, avoidStaleNodes, storageType);
maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
}
}
@ -478,10 +567,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
int maxNodesPerRack,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
StorageType storageType)
EnumMap<StorageType, Integer> storageTypes)
throws NotEnoughReplicasException {
return chooseRandom(1, scope, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageType);
results, avoidStaleNodes, storageTypes);
}
/**
@ -495,8 +584,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
int maxNodesPerRack,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
StorageType storageType)
throws NotEnoughReplicasException {
EnumMap<StorageType, Integer> storageTypes)
throws NotEnoughReplicasException {
int numOfAvailableNodes = clusterMap.countNumOfAvailableNodes(
scope, excludedNodes);
@ -512,24 +601,43 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
DatanodeDescriptor chosenNode =
(DatanodeDescriptor)clusterMap.chooseRandom(scope);
if (excludedNodes.add(chosenNode)) { //was not in the excluded list
if (LOG.isDebugEnabled()) {
builder.append("\nNode ").append(NodeBase.getPath(chosenNode)).append(" [");
}
numOfAvailableNodes--;
final DatanodeStorageInfo[] storages = DFSUtil.shuffle(
chosenNode.getStorageInfos());
int i;
for(i = 0; i < storages.length; i++) {
final int newExcludedNodes = addIfIsGoodTarget(storages[i],
excludedNodes, blocksize, maxNodesPerRack, considerLoad, results,
avoidStaleNodes, storageType);
if (newExcludedNodes >= 0) {
numOfReplicas--;
if (firstChosen == null) {
firstChosen = storages[i];
int i = 0;
boolean search = true;
for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
.entrySet().iterator(); search && iter.hasNext(); ) {
Map.Entry<StorageType, Integer> entry = iter.next();
for (i = 0; i < storages.length; i++) {
StorageType type = entry.getKey();
final int newExcludedNodes = addIfIsGoodTarget(storages[i],
excludedNodes, blocksize, maxNodesPerRack, considerLoad, results,
avoidStaleNodes, type);
if (newExcludedNodes >= 0) {
numOfReplicas--;
if (firstChosen == null) {
firstChosen = storages[i];
}
numOfAvailableNodes -= newExcludedNodes;
int num = entry.getValue();
if (num == 1) {
iter.remove();
} else {
entry.setValue(num - 1);
}
search = false;
break;
}
numOfAvailableNodes -= newExcludedNodes;
break;
}
}
if (LOG.isDebugEnabled()) {
builder.append("\n]");
}
// If no candidate storage was found on this DN then set badTarget.
badTarget = (i == storages.length);
@ -540,9 +648,11 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
String detail = enableDebugLogging;
if (LOG.isDebugEnabled()) {
if (badTarget && builder != null) {
detail = builder.append("]").toString();
detail = builder.toString();
builder.setLength(0);
} else detail = "";
} else {
detail = "";
}
}
throw new NotEnoughReplicasException(detail);
}
@ -576,14 +686,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
private static void logNodeIsNotChosen(DatanodeStorageInfo storage, String reason) {
if (LOG.isDebugEnabled()) {
final DatanodeDescriptor node = storage.getDatanodeDescriptor();
// build the error message for later use.
debugLoggingBuilder.get()
.append(node).append(": ")
.append("Storage ").append(storage)
.append("at node ").append(NodeBase.getPath(node))
.append(" is not chosen because ")
.append(reason);
.append("\n Storage ").append(storage)
.append(" is not chosen since ").append(reason).append(".");
}
}
@ -608,11 +714,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
boolean considerLoad,
List<DatanodeStorageInfo> results,
boolean avoidStaleNodes,
StorageType storageType) {
if (storage.getStorageType() != storageType) {
logNodeIsNotChosen(storage,
"storage types do not match, where the expected storage type is "
+ storageType);
StorageType requiredStorageType) {
if (storage.getStorageType() != requiredStorageType) {
logNodeIsNotChosen(storage, "storage types do not match,"
+ " where the required storage type is " + requiredStorageType);
return false;
}
if (storage.getState() == State.READ_ONLY_SHARED) {
@ -634,9 +739,14 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
}
final long requiredSize = blockSize * HdfsConstants.MIN_BLOCKS_FOR_WRITE;
final long scheduledSize = blockSize * node.getBlocksScheduled();
if (requiredSize > storage.getRemaining() - scheduledSize) {
logNodeIsNotChosen(storage, "the node does not have enough space ");
final long scheduledSize = blockSize * node.getBlocksScheduled(storage.getStorageType());
final long remaining = node.getRemaining(storage.getStorageType());
if (requiredSize > remaining - scheduledSize) {
logNodeIsNotChosen(storage, "the node does not have enough "
+ storage.getStorageType() + " space"
+ " (required=" + requiredSize
+ ", scheduled=" + scheduledSize
+ ", remaining=" + remaining + ")");
return false;
}
@ -645,8 +755,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
final double maxLoad = 2.0 * stats.getInServiceXceiverAverage();
final int nodeLoad = node.getXceiverCount();
if (nodeLoad > maxLoad) {
logNodeIsNotChosen(storage,
"the node is too busy (load:"+nodeLoad+" > "+maxLoad+") ");
logNodeIsNotChosen(storage, "the node is too busy (load: " + nodeLoad
+ " > " + maxLoad + ") ");
return false;
}
}
@ -666,7 +776,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
}
return true;
}
/**
* Return a pipeline of nodes.
* The pipeline is formed finding a shortest path that
@ -732,7 +842,8 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
public DatanodeStorageInfo chooseReplicaToDelete(BlockCollection bc,
Block block, short replicationFactor,
Collection<DatanodeStorageInfo> first,
Collection<DatanodeStorageInfo> second) {
Collection<DatanodeStorageInfo> second,
final List<StorageType> excessTypes) {
long oldestHeartbeat =
now() - heartbeatInterval * tolerateHeartbeatMultiplier;
DatanodeStorageInfo oldestHeartbeatStorage = null;
@ -742,6 +853,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
// Pick the node with the oldest heartbeat or with the least free space,
// if all heartbeats are within the tolerable heartbeat interval
for(DatanodeStorageInfo storage : pickupReplicaSet(first, second)) {
if (!excessTypes.contains(storage.getStorageType())) {
continue;
}
final DatanodeDescriptor node = storage.getDatanodeDescriptor();
long free = node.getRemaining();
long lastHeartbeat = node.getLastUpdate();
@ -755,8 +870,16 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
}
}
return oldestHeartbeatStorage != null? oldestHeartbeatStorage
: minSpaceStorage;
final DatanodeStorageInfo storage;
if (oldestHeartbeatStorage != null) {
storage = oldestHeartbeatStorage;
} else if (minSpaceStorage != null) {
storage = minSpaceStorage;
} else {
return null;
}
excessTypes.remove(storage.getStorageType());
return storage;
}
/**
View File
@ -17,12 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSUtil;
@ -70,22 +65,33 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
StorageType storageType, boolean fallbackToLocalRack
) throws NotEnoughReplicasException {
EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack)
throws NotEnoughReplicasException {
// if no local machine, randomly choose one node
if (localMachine == null)
return chooseRandom(NodeBase.ROOT, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
// otherwise try local machine first
if (localMachine instanceof DatanodeDescriptor) {
DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
if (excludedNodes.add(localMachine)) { // was not in the excluded list
for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
localDataNode.getStorageInfos())) {
if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
return localStorage;
for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes
.entrySet().iterator(); iter.hasNext(); ) {
Map.Entry<StorageType, Integer> entry = iter.next();
for (DatanodeStorageInfo localStorage : DFSUtil.shuffle(
localDataNode.getStorageInfos())) {
StorageType type = entry.getKey();
if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) {
int num = entry.getValue();
if (num == 1) {
iter.remove();
} else {
entry.setValue(num - 1);
}
return localStorage;
}
}
}
}
@ -94,7 +100,7 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
// try a node on local node group
DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
(NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
if (chosenStorage != null) {
return chosenStorage;
}
@ -104,7 +110,7 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
}
// try a node on local rack
return chooseLocalRack(localMachine, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
/** @return the node of the second replica */
@ -124,18 +130,19 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
StorageType storageType) throws NotEnoughReplicasException {
EnumMap<StorageType, Integer> storageTypes) throws
NotEnoughReplicasException {
// no local machine, so choose a random machine
if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
// choose one from the local rack, but off-nodegroup
try {
final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes, storageType);
results, avoidStaleNodes, storageTypes);
} catch (NotEnoughReplicasException e1) {
// find the second replica
final DatanodeDescriptor newLocal = secondNode(localMachine, results);
@ -143,16 +150,17 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
try {
return chooseRandom(
clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
blocksize, maxNodesPerRack, results, avoidStaleNodes,
storageTypes);
} catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
} else {
//otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
}
}
@ -161,8 +169,8 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
protected void chooseRemoteRack(int numOfReplicas,
DatanodeDescriptor localMachine, Set<Node> excludedNodes,
long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes, StorageType storageType)
throws NotEnoughReplicasException {
boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes)
throws NotEnoughReplicasException {
int oldNumOfReplicas = results.size();
final String rackLocation = NetworkTopology.getFirstHalf(
@ -170,12 +178,12 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
try {
// randomly choose from remote racks
chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
maxReplicasPerRack, results, avoidStaleNodes, storageType);
maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
} catch (NotEnoughReplicasException e) {
// fall back to the local rack
chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
rackLocation, excludedNodes, blocksize,
maxReplicasPerRack, results, avoidStaleNodes, storageType);
maxReplicasPerRack, results, avoidStaleNodes, storageTypes);
}
}
@ -189,11 +197,12 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
StorageType storageType) throws NotEnoughReplicasException {
EnumMap<StorageType, Integer> storageTypes) throws
NotEnoughReplicasException {
// no local machine, so choose a random machine
if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
// choose one from the local node group
@ -201,7 +210,7 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
return chooseRandom(
clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
storageType);
storageTypes);
} catch (NotEnoughReplicasException e1) {
final DatanodeDescriptor newLocal = secondNode(localMachine, results);
if (newLocal != null) {
@ -209,16 +218,16 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
return chooseRandom(
clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
excludedNodes, blocksize, maxNodesPerRack, results,
avoidStaleNodes, storageType);
avoidStaleNodes, storageTypes);
} catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
} else {
//otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes, storageType);
maxNodesPerRack, results, avoidStaleNodes, storageTypes);
}
}
}

View File

@ -28,16 +28,19 @@ import java.util.Map;
import java.util.Queue;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.util.EnumCounters;
import org.apache.hadoop.hdfs.util.LightWeightHashSet;
import org.apache.hadoop.util.IntrusiveCollection;
import org.apache.hadoop.util.Time;
@ -202,8 +205,10 @@ public class DatanodeDescriptor extends DatanodeInfo {
* in case of errors (e.g. datanode does not report if an error occurs
* while writing the block).
*/
private int currApproxBlocksScheduled = 0;
private int prevApproxBlocksScheduled = 0;
private EnumCounters<StorageType> currApproxBlocksScheduled
= new EnumCounters<StorageType>(StorageType.class);
private EnumCounters<StorageType> prevApproxBlocksScheduled
= new EnumCounters<StorageType>(StorageType.class);
private long lastBlocksScheduledRollTime = 0;
private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
private int volumeFailures = 0;
@ -474,25 +479,48 @@ public class DatanodeDescriptor extends DatanodeInfo {
}
}
/**
* @return the total remaining space of the given storage type on this datanode.
*/
public long getRemaining(StorageType t) {
long remaining = 0;
for(DatanodeStorageInfo s : getStorageInfos()) {
if (s.getStorageType() == t) {
remaining += s.getRemaining();
}
}
return remaining;
}
/**
* @return Approximate number of blocks currently scheduled to be written
* to the given storage type of this datanode.
*/
public int getBlocksScheduled(StorageType t) {
return (int)(currApproxBlocksScheduled.get(t)
+ prevApproxBlocksScheduled.get(t));
}
/**
* @return Approximate number of blocks currently scheduled to be written
* to this datanode.
*/
public int getBlocksScheduled() {
return currApproxBlocksScheduled + prevApproxBlocksScheduled;
return (int)(currApproxBlocksScheduled.sum()
+ prevApproxBlocksScheduled.sum());
}
/** Increment the number of blocks scheduled. */
void incrementBlocksScheduled() {
currApproxBlocksScheduled++;
void incrementBlocksScheduled(StorageType t) {
currApproxBlocksScheduled.add(t, 1);
}
/** Decrement the number of blocks scheduled. */
void decrementBlocksScheduled() {
if (prevApproxBlocksScheduled > 0) {
prevApproxBlocksScheduled--;
} else if (currApproxBlocksScheduled > 0) {
currApproxBlocksScheduled--;
void decrementBlocksScheduled(StorageType t) {
if (prevApproxBlocksScheduled.get(t) > 0) {
prevApproxBlocksScheduled.subtract(t, 1);
} else if (currApproxBlocksScheduled.get(t) > 0) {
currApproxBlocksScheduled.subtract(t, 1);
}
// it's ok if both counters are zero.
}
@ -500,8 +528,8 @@ public class DatanodeDescriptor extends DatanodeInfo {
/** Adjusts curr and prev number of blocks scheduled every few minutes. */
private void rollBlocksScheduled(long now) {
if (now - lastBlocksScheduledRollTime > BLOCKS_SCHEDULED_ROLL_INTERVAL) {
prevApproxBlocksScheduled = currApproxBlocksScheduled;
currApproxBlocksScheduled = 0;
prevApproxBlocksScheduled.set(currApproxBlocksScheduled);
currApproxBlocksScheduled.reset();
lastBlocksScheduledRollTime = now;
}
}
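The counters above track scheduled blocks per storage type instead of a single pair of ints. A compact sketch of the same bookkeeping, using a plain EnumMap in place of the EnumCounters helper (the enum values and the 10-minute roll interval are illustrative assumptions):

import java.util.EnumMap;

public class ScheduledBlockCounters {
  enum StoreType { DISK, SSD }             // stand-in for StorageType

  private final EnumMap<StoreType, Long> curr = new EnumMap<>(StoreType.class);
  private final EnumMap<StoreType, Long> prev = new EnumMap<>(StoreType.class);
  private long lastRollMs = 0;
  private static final long ROLL_INTERVAL_MS = 600_000;   // 10 minutes, as in the code above

  private static long get(EnumMap<StoreType, Long> m, StoreType t) {
    return m.getOrDefault(t, 0L);
  }

  void increment(StoreType t) { curr.merge(t, 1L, Long::sum); }

  void decrement(StoreType t) {
    if (get(prev, t) > 0) {
      prev.merge(t, -1L, Long::sum);
    } else if (get(curr, t) > 0) {
      curr.merge(t, -1L, Long::sum);
    }
    // it's ok if both counters are already zero
  }

  int scheduled(StoreType t) { return (int) (get(curr, t) + get(prev, t)); }

  /** Every few minutes, blocks scheduled long ago are assumed written or lost. */
  void roll(long nowMs) {
    if (nowMs - lastRollMs > ROLL_INTERVAL_MS) {
      prev.clear();
      prev.putAll(curr);     // prev := curr
      curr.clear();          // curr := 0
      lastRollMs = nowMs;
    }
  }
}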

View File

@ -348,8 +348,7 @@ public class DatanodeManager {
/** Sort the located blocks by the distance to the target host. */
public void sortLocatedBlocks(final String targethost,
final List<LocatedBlock> locatedblocks,
boolean randomizeBlockLocationsPerBlock) {
final List<LocatedBlock> locatedblocks) {
//sort the blocks
// As it is possible for the separation of node manager and datanode,
// here we should get node but not datanode only .
@ -376,8 +375,7 @@ public class DatanodeManager {
--lastActiveIndex;
}
int activeLen = lastActiveIndex + 1;
networktopology.sortByDistance(client, b.getLocations(), activeLen, b
.getBlock().getBlockId(), randomizeBlockLocationsPerBlock);
networktopology.sortByDistance(client, b.getLocations(), activeLen);
}
}

View File

@ -109,7 +109,7 @@ public class DatanodeStorageInfo {
private long capacity;
private long dfsUsed;
private long remaining;
private volatile long remaining;
private long blockPoolUsed;
private volatile BlockInfo blockList = null;
@ -283,7 +283,7 @@ public class DatanodeStorageInfo {
/** Increment the number of blocks scheduled for each given storage */
public static void incrementBlocksScheduled(DatanodeStorageInfo... storages) {
for (DatanodeStorageInfo s : storages) {
s.getDatanodeDescriptor().incrementBlocksScheduled();
s.getDatanodeDescriptor().incrementBlocksScheduled(s.getStorageType());
}
}
@ -314,6 +314,26 @@ public class DatanodeStorageInfo {
false, capacity, dfsUsed, remaining, blockPoolUsed);
}
static Iterable<StorageType> toStorageTypes(
final Iterable<DatanodeStorageInfo> infos) {
return new Iterable<StorageType>() {
@Override
public Iterator<StorageType> iterator() {
return new Iterator<StorageType>() {
final Iterator<DatanodeStorageInfo> i = infos.iterator();
@Override
public boolean hasNext() {return i.hasNext();}
@Override
public StorageType next() {return i.next().getStorageType();}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
};
}
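toStorageTypes above is an instance of a small adapter pattern: wrap an Iterable and map each element lazily, without copying. A generic sketch of the same idea (names are illustrative):

import java.util.Arrays;
import java.util.Iterator;
import java.util.function.Function;

public class MappingIterable {
  /** Lazily view an Iterable<A> as an Iterable<B>, without copying. */
  static <A, B> Iterable<B> map(final Iterable<A> source, final Function<A, B> fn) {
    return () -> new Iterator<B>() {
      private final Iterator<A> it = source.iterator();
      @Override public boolean hasNext() { return it.hasNext(); }
      @Override public B next() { return fn.apply(it.next()); }
      @Override public void remove() { throw new UnsupportedOperationException(); }
    };
  }

  public static void main(String[] args) {
    for (Integer len : map(Arrays.asList("disk", "ssd"), String::length)) {
      System.out.println(len);    // 4, then 3
    }
  }
}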
/** @return the first {@link DatanodeStorageInfo} corresponding to
* the given datanode
*/

View File

@ -94,7 +94,12 @@ public final class HdfsServerConstants {
NONINTERACTIVE("-nonInteractive"),
RENAMERESERVED("-renameReserved"),
METADATAVERSION("-metadataVersion"),
UPGRADEONLY("-upgradeOnly");
UPGRADEONLY("-upgradeOnly"),
// The -hotswap constant should not be used as a startup option; it is
// only used by StorageDirectory.analyzeStorage() in the hot swap drive scenario.
// TODO: refactor StorageDirectory.analyzeStorage() so that we can do away with
// this constant in StartupOption.
HOTSWAP("-hotswap");
private static final Pattern ENUM_WITH_ROLLING_UPGRADE_OPTION = Pattern.compile(
"(\\w+)\\((\\w+)\\)");

View File

@ -464,17 +464,20 @@ public abstract class Storage extends StorageInfo {
public StorageState analyzeStorage(StartupOption startOpt, Storage storage)
throws IOException {
assert root != null : "root is null";
boolean hadMkdirs = false;
String rootPath = root.getCanonicalPath();
try { // check that storage exists
if (!root.exists()) {
// storage directory does not exist
if (startOpt != StartupOption.FORMAT) {
if (startOpt != StartupOption.FORMAT &&
startOpt != StartupOption.HOTSWAP) {
LOG.warn("Storage directory " + rootPath + " does not exist");
return StorageState.NON_EXISTENT;
}
LOG.info(rootPath + " does not exist. Creating ...");
if (!root.mkdirs())
throw new IOException("Cannot create directory " + rootPath);
hadMkdirs = true;
}
// or is inaccessible
if (!root.isDirectory()) {
@ -492,7 +495,10 @@ public abstract class Storage extends StorageInfo {
this.lock(); // lock storage if it exists
if (startOpt == HdfsServerConstants.StartupOption.FORMAT)
// If startOpt is HOTSWAP, return NOT_FORMATTED for an empty directory,
// while still checking the layout version.
if (startOpt == HdfsServerConstants.StartupOption.FORMAT ||
(startOpt == StartupOption.HOTSWAP && hadMkdirs))
return StorageState.NOT_FORMATTED;
if (startOpt != HdfsServerConstants.StartupOption.IMPORT) {

View File

@ -70,8 +70,10 @@ import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -80,11 +82,13 @@ import java.util.concurrent.atomic.AtomicInteger;
import javax.management.ObjectName;
import com.google.common.collect.Lists;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.ReconfigurableBase;
import org.apache.hadoop.conf.ReconfigurationException;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
@ -137,6 +141,7 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NodeType;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.JspHelper;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.datanode.SecureDataNodeStarter.SecureResources;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
@ -220,7 +225,7 @@ import com.google.protobuf.BlockingService;
*
**********************************************************/
@InterfaceAudience.Private
public class DataNode extends Configured
public class DataNode extends ReconfigurableBase
implements InterDatanodeProtocol, ClientDatanodeProtocol,
DataNodeMXBean {
public static final Log LOG = LogFactory.getLog(DataNode.class);
@ -305,6 +310,7 @@ public class DataNode extends Configured
private JvmPauseMonitor pauseMonitor;
private SecureResources secureResources = null;
// dataDirs must be accessed while holding the DataNode lock.
private List<StorageLocation> dataDirs;
private Configuration conf;
private final String confVersion;
@ -386,6 +392,149 @@ public class DataNode extends Configured
}
}
@Override
public void reconfigurePropertyImpl(String property, String newVal)
throws ReconfigurationException {
if (property.equals(DFS_DATANODE_DATA_DIR_KEY)) {
try {
LOG.info("Reconfiguring " + property + " to " + newVal);
this.refreshVolumes(newVal);
} catch (Exception e) {
throw new ReconfigurationException(property, newVal,
getConf().get(property), e);
}
} else {
throw new ReconfigurationException(
property, newVal, getConf().get(property));
}
}
/**
* Get the keys of the reconfigurable properties in the configuration.
*/
@Override
public Collection<String> getReconfigurableProperties() {
List<String> reconfigurable =
Collections.unmodifiableList(Arrays.asList(DFS_DATANODE_DATA_DIR_KEY));
return reconfigurable;
}
/**
* Contains the StorageLocations for changed data volumes.
*/
@VisibleForTesting
static class ChangedVolumes {
List<StorageLocation> newLocations = Lists.newArrayList();
List<StorageLocation> deactivateLocations = Lists.newArrayList();
}
/**
* Parse the new DFS_DATANODE_DATA_DIR value in the configuration to detect
* changed volumes.
* @return changed volumes.
* @throws IOException if no directory is specified in the
* configuration.
*/
@VisibleForTesting
ChangedVolumes parseChangedVolumes() throws IOException {
List<StorageLocation> locations = getStorageLocations(getConf());
if (locations.isEmpty()) {
throw new IOException("No directory is specified.");
}
ChangedVolumes results = new ChangedVolumes();
results.newLocations.addAll(locations);
for (Iterator<Storage.StorageDirectory> it = storage.dirIterator();
it.hasNext(); ) {
Storage.StorageDirectory dir = it.next();
boolean found = false;
for (Iterator<StorageLocation> sl = results.newLocations.iterator();
sl.hasNext(); ) {
if (sl.next().getFile().getCanonicalPath().equals(
dir.getRoot().getCanonicalPath())) {
sl.remove();
found = true;
break;
}
}
if (!found) {
results.deactivateLocations.add(
StorageLocation.parse(dir.getRoot().toString()));
}
}
return results;
}
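parseChangedVolumes boils down to a set difference on canonical paths: anything configured but not already mounted is new, and anything mounted but no longer configured is deactivated. A minimal sketch with plain strings standing in for StorageLocation and StorageDirectory (the paths are assumed to be canonicalized already):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class VolumeDiff {
  final List<String> newLocations = new ArrayList<>();
  final List<String> deactivateLocations = new ArrayList<>();

  /** Compare the configured volume list against the currently mounted ones. */
  static VolumeDiff diff(List<String> configured, List<String> mounted) {
    VolumeDiff d = new VolumeDiff();
    d.newLocations.addAll(configured);
    for (String dir : mounted) {
      // a mounted dir that is still configured is unchanged; drop it from "new"
      if (!d.newLocations.remove(dir)) {
        d.deactivateLocations.add(dir);   // mounted but no longer configured
      }
    }
    return d;
  }

  public static void main(String[] args) {
    VolumeDiff d = diff(Arrays.asList("/data1", "/data3"), Arrays.asList("/data1", "/data2"));
    System.out.println(d.newLocations);          // [/data3]
    System.out.println(d.deactivateLocations);   // [/data2]
  }
}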
/**
* Attempts to reload data volumes with new configuration.
* @param newVolumes a comma separated string that specifies the data volumes.
* @throws Exception
*/
private synchronized void refreshVolumes(String newVolumes) throws Exception {
Configuration conf = getConf();
String oldVolumes = conf.get(DFS_DATANODE_DATA_DIR_KEY);
conf.set(DFS_DATANODE_DATA_DIR_KEY, newVolumes);
List<StorageLocation> locations = getStorageLocations(conf);
final int numOldDataDirs = dataDirs.size();
dataDirs = locations;
ChangedVolumes changedVolumes = parseChangedVolumes();
try {
if (numOldDataDirs + changedVolumes.newLocations.size() -
changedVolumes.deactivateLocations.size() <= 0) {
throw new IOException("Attempt to remove all volumes.");
}
if (!changedVolumes.newLocations.isEmpty()) {
LOG.info("Adding new volumes: " +
Joiner.on(",").join(changedVolumes.newLocations));
// Add volumes for each Namespace
for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) {
NamespaceInfo nsInfo = bpos.getNamespaceInfo();
LOG.info("Loading volumes for namesapce: " + nsInfo.getNamespaceID());
storage.addStorageLocations(
this, nsInfo, changedVolumes.newLocations, StartupOption.HOTSWAP);
}
List<String> bpids = Lists.newArrayList();
for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) {
bpids.add(bpos.getBlockPoolId());
}
List<StorageLocation> succeedVolumes =
data.addVolumes(changedVolumes.newLocations, bpids);
if (succeedVolumes.size() < changedVolumes.newLocations.size()) {
List<StorageLocation> failedVolumes = Lists.newArrayList();
// Clean all failed volumes.
for (StorageLocation location : changedVolumes.newLocations) {
if (!succeedVolumes.contains(location)) {
failedVolumes.add(location);
}
}
storage.removeVolumes(failedVolumes);
data.removeVolumes(failedVolumes);
}
}
if (!changedVolumes.deactivateLocations.isEmpty()) {
LOG.info("Deactivating volumes: " +
Joiner.on(",").join(changedVolumes.deactivateLocations));
data.removeVolumes(changedVolumes.deactivateLocations);
storage.removeVolumes(changedVolumes.deactivateLocations);
}
} catch (IOException e) {
LOG.warn("There is IOException when refreshing volumes! "
+ "Recover configurations: " + DFS_DATANODE_DATA_DIR_KEY
+ " = " + oldVolumes, e);
throw e;
}
}
private synchronized void setClusterId(final String nsCid, final String bpid
) throws IOException {
if(clusterId != null && !clusterId.equals(nsCid)) {
@ -822,7 +971,9 @@ public class DataNode extends Configured
// settings global for all BPs in the Data Node
this.secureResources = resources;
this.dataDirs = dataDirs;
synchronized (this) {
this.dataDirs = dataDirs;
}
this.conf = conf;
this.dnConf = new DNConf(conf);
this.spanReceiverHost = SpanReceiverHost.getInstance(conf);
@ -1112,7 +1263,9 @@ public class DataNode extends Configured
}
final String bpid = nsInfo.getBlockPoolID();
//read storage info, lock data dirs and transition fs state if necessary
storage.recoverTransitionRead(this, bpid, nsInfo, dataDirs, startOpt);
synchronized (this) {
storage.recoverTransitionRead(this, bpid, nsInfo, dataDirs, startOpt);
}
final StorageInfo bpStorage = storage.getBPStorage(bpid);
LOG.info("Setting up storage: nsid=" + bpStorage.getNamespaceID()
+ ";bpid=" + bpid + ";lv=" + storage.getLayoutVersion()
@ -1744,7 +1897,9 @@ public class DataNode extends Configured
+ b + " (numBytes=" + b.getNumBytes() + ")"
+ ", stage=" + stage
+ ", clientname=" + clientname
+ ", targets=" + Arrays.asList(targets));
+ ", targets=" + Arrays.asList(targets)
+ ", target storage types=" + (targetStorageTypes == null ? "[]" :
Arrays.asList(targetStorageTypes)));
}
this.targets = targets;
this.targetStorageTypes = targetStorageTypes;

View File

@ -94,8 +94,8 @@ public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
public List<V> getVolumes();
/** Add an array of StorageLocation to FsDataset. */
public void addVolumes(Collection<StorageLocation> volumes)
throws IOException;
public List<StorageLocation> addVolumes(List<StorageLocation> volumes,
final Collection<String> bpids);
/** Removes a collection of volumes from FsDataset. */
public void removeVolumes(Collection<StorageLocation> volumes);

View File

@ -28,13 +28,16 @@ import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executor;
import javax.management.NotCompliantMBeanException;
@ -42,6 +45,7 @@ import javax.management.ObjectName;
import javax.management.StandardMBean;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -88,6 +92,7 @@ import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlo
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.io.MultipleIOException;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.util.Daemon;
@ -255,7 +260,7 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
+ ", volume failures tolerated: " + volFailuresTolerated);
}
storageMap = new HashMap<String, DatanodeStorage>();
storageMap = new ConcurrentHashMap<String, DatanodeStorage>();
volumeMap = new ReplicaMap(this);
lazyWriteReplicaTracker = new LazyWriteReplicaTracker(this);
@ -291,45 +296,124 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
// storageMap and asyncDiskService, consistent.
FsVolumeImpl fsVolume = FsVolumeImplAllocator.createVolume(
this, sd.getStorageUuid(), dir, this.conf, storageType);
fsVolume.getVolumeMap(volumeMap, lazyWriteReplicaTracker);
ReplicaMap tempVolumeMap = new ReplicaMap(this);
fsVolume.getVolumeMap(tempVolumeMap, lazyWriteReplicaTracker);
volumeMap.addAll(tempVolumeMap);
volumes.addVolume(fsVolume);
storageMap.put(sd.getStorageUuid(),
new DatanodeStorage(sd.getStorageUuid(),
DatanodeStorage.State.NORMAL,
storageType));
DatanodeStorage.State.NORMAL,
storageType));
asyncDiskService.addVolume(sd.getCurrentDir());
LOG.info("Added volume - " + dir + ", StorageType: " + storageType);
}
private void addVolumeAndBlockPool(Collection<StorageLocation> dataLocations,
Storage.StorageDirectory sd, final Collection<String> bpids)
throws IOException {
final File dir = sd.getCurrentDir();
final StorageType storageType =
getStorageTypeFromLocations(dataLocations, sd.getRoot());
final FsVolumeImpl fsVolume = new FsVolumeImpl(
this, sd.getStorageUuid(), dir, this.conf, storageType);
final ReplicaMap tempVolumeMap = new ReplicaMap(fsVolume);
List<IOException> exceptions = Lists.newArrayList();
for (final String bpid : bpids) {
try {
fsVolume.addBlockPool(bpid, this.conf);
fsVolume.getVolumeMap(bpid, tempVolumeMap, lazyWriteReplicaTracker);
} catch (IOException e) {
LOG.warn("Caught exception when adding " + fsVolume +
". Will throw later.", e);
exceptions.add(e);
}
}
if (!exceptions.isEmpty()) {
// The state of FsDatasetImpl is not modified, so there is no need to roll back.
throw MultipleIOException.createIOException(exceptions);
}
volumeMap.addAll(tempVolumeMap);
storageMap.put(sd.getStorageUuid(),
new DatanodeStorage(sd.getStorageUuid(),
DatanodeStorage.State.NORMAL,
storageType));
asyncDiskService.addVolume(sd.getCurrentDir());
volumes.addVolume(fsVolume);
LOG.info("Added volume - " + dir + ", StorageType: " + storageType);
}
/**
* Add an array of StorageLocation to FsDataset.
*
* @pre dataStorage must have these volumes.
* @param volumes
* @throws IOException
* @param volumes an array of storage locations for adding volumes.
* @param bpids block pool IDs.
* @return an array of successfully loaded volumes.
*/
@Override
public synchronized void addVolumes(Collection<StorageLocation> volumes)
throws IOException {
public synchronized List<StorageLocation> addVolumes(
final List<StorageLocation> volumes, final Collection<String> bpids) {
final Collection<StorageLocation> dataLocations =
DataNode.getStorageLocations(this.conf);
Map<String, Storage.StorageDirectory> allStorageDirs =
final Map<String, Storage.StorageDirectory> allStorageDirs =
new HashMap<String, Storage.StorageDirectory>();
for (int idx = 0; idx < dataStorage.getNumStorageDirs(); idx++) {
Storage.StorageDirectory sd = dataStorage.getStorageDir(idx);
allStorageDirs.put(sd.getRoot().getAbsolutePath(), sd);
List<StorageLocation> succeedVolumes = Lists.newArrayList();
try {
for (int idx = 0; idx < dataStorage.getNumStorageDirs(); idx++) {
Storage.StorageDirectory sd = dataStorage.getStorageDir(idx);
allStorageDirs.put(sd.getRoot().getCanonicalPath(), sd);
}
} catch (IOException ioe) {
LOG.warn("Caught exception when parsing storage URL.", ioe);
return succeedVolumes;
}
for (StorageLocation vol : volumes) {
String key = vol.getFile().getAbsolutePath();
if (!allStorageDirs.containsKey(key)) {
LOG.warn("Attempt to add an invalid volume: " + vol.getFile());
} else {
addVolume(dataLocations, allStorageDirs.get(key));
final boolean[] successFlags = new boolean[volumes.size()];
Arrays.fill(successFlags, false);
List<Thread> volumeAddingThreads = Lists.newArrayList();
for (int i = 0; i < volumes.size(); i++) {
final int idx = i;
Thread t = new Thread() {
public void run() {
StorageLocation vol = volumes.get(idx);
try {
String key = vol.getFile().getCanonicalPath();
if (!allStorageDirs.containsKey(key)) {
LOG.warn("Attempt to add an invalid volume: " + vol.getFile());
} else {
addVolumeAndBlockPool(dataLocations, allStorageDirs.get(key),
bpids);
successFlags[idx] = true;
}
} catch (IOException e) {
LOG.warn("Caught exception when adding volume " + vol, e);
}
}
};
volumeAddingThreads.add(t);
t.start();
}
for (Thread t : volumeAddingThreads) {
try {
t.join();
} catch (InterruptedException e) {
LOG.warn("Caught InterruptedException when adding volume.", e);
}
}
for (int i = 0; i < volumes.size(); i++) {
if (successFlags[i]) {
succeedVolumes.add(volumes.get(i));
}
}
return succeedVolumes;
}
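The addVolumes implementation above starts one worker thread per location and records per-location success in a shared flag array; only locations whose flag ends up true are reported back to the caller. A stripped-down sketch of that pattern, with the per-volume work simulated:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ParallelAdd {
  public static void main(String[] args) throws InterruptedException {
    final List<String> volumes = Arrays.asList("/data1", "/bad", "/data2");
    final boolean[] ok = new boolean[volumes.size()];
    List<Thread> workers = new ArrayList<>();

    for (int i = 0; i < volumes.size(); i++) {
      final int idx = i;
      Thread t = new Thread(() -> {
        try {
          // stand-in for the real per-volume work, which may throw
          if (volumes.get(idx).contains("bad")) {
            throw new IllegalStateException("cannot add " + volumes.get(idx));
          }
          ok[idx] = true;
        } catch (RuntimeException e) {
          System.err.println("failed: " + e.getMessage());
        }
      });
      workers.add(t);
      t.start();
    }
    for (Thread t : workers) {
      t.join();                        // wait for every attempt to finish
    }

    List<String> succeeded = new ArrayList<>();
    for (int i = 0; i < volumes.size(); i++) {
      if (ok[i]) {
        succeeded.add(volumes.get(i));
      }
    }
    System.out.println(succeeded);     // [/data1, /data2]
  }
}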
/**
@ -351,9 +435,9 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
String volume = sd.getRoot().toString();
LOG.info("Removing " + volume + " from FsDataset.");
this.volumes.removeVolume(volume);
storageMap.remove(sd.getStorageUuid());
// Disable the volume from the service.
asyncDiskService.removeVolume(sd.getCurrentDir());
this.volumes.removeVolume(volume);
// Remove all replica information for the blocks on the volume. Unlike
// updating the volumeMap in addVolume(), this operation does not scan
@ -364,7 +448,7 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
it.hasNext(); ) {
ReplicaInfo block = it.next();
if (block.getVolume().getBasePath().equals(volume)) {
invalidate(bpid, block.getBlockId());
invalidate(bpid, block);
blocks.add(block);
it.remove();
}
@ -373,6 +457,8 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
datanode.getBlockScanner().deleteBlocks(bpid,
blocks.toArray(new Block[blocks.size()]));
}
storageMap.remove(sd.getStorageUuid());
}
}
}
@ -1434,23 +1520,26 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
/**
* Invalidate a block but does not delete the actual on-disk block file.
*
* It should only be used for decommissioning disks.
* It should only be used when deactivating disks.
*
* @param bpid the block pool ID.
* @param blockId the ID of the block.
* @param block The block to be invalidated.
*/
public void invalidate(String bpid, long blockId) {
public void invalidate(String bpid, ReplicaInfo block) {
// If a DFSClient has the replica in its cache of short-circuit file
// descriptors (and the client is using ShortCircuitShm), invalidate it.
// The short-circuit registry is null in the unit tests, because the
// datanode is mock object.
if (datanode.getShortCircuitRegistry() != null) {
datanode.getShortCircuitRegistry().processBlockInvalidation(
new ExtendedBlockId(blockId, bpid));
new ExtendedBlockId(block.getBlockId(), bpid));
// If the block is cached, start uncaching it.
cacheManager.uncacheBlock(bpid, blockId);
cacheManager.uncacheBlock(bpid, block.getBlockId());
}
datanode.notifyNamenodeDeletedBlock(new ExtendedBlock(bpid, block),
block.getStorageUuid());
}
/**

View File

@ -0,0 +1,644 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.mover;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.*;
import org.apache.hadoop.hdfs.protocol.*;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.*;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;
import org.apache.hadoop.hdfs.server.balancer.ExitStatus;
import org.apache.hadoop.hdfs.server.balancer.Matcher;
import org.apache.hadoop.hdfs.server.balancer.NameNodeConnector;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.text.DateFormat;
import java.util.*;
@InterfaceAudience.Private
public class Mover {
static final Log LOG = LogFactory.getLog(Mover.class);
static final Path MOVER_ID_PATH = new Path("/system/mover.id");
private static class StorageMap {
private final StorageGroupMap<Source> sources
= new StorageGroupMap<Source>();
private final StorageGroupMap<StorageGroup> targets
= new StorageGroupMap<StorageGroup>();
private final EnumMap<StorageType, List<StorageGroup>> targetStorageTypeMap
= new EnumMap<StorageType, List<StorageGroup>>(StorageType.class);
private StorageMap() {
for(StorageType t : StorageType.asList()) {
targetStorageTypeMap.put(t, new LinkedList<StorageGroup>());
}
}
private void add(Source source, StorageGroup target) {
sources.put(source);
targets.put(target);
getTargetStorages(target.getStorageType()).add(target);
}
private Source getSource(MLocation ml) {
return get(sources, ml);
}
private StorageGroup getTarget(MLocation ml) {
return get(targets, ml);
}
private static <G extends StorageGroup> G get(StorageGroupMap<G> map, MLocation ml) {
return map.get(ml.datanode.getDatanodeUuid(), ml.storageType);
}
private List<StorageGroup> getTargetStorages(StorageType t) {
return targetStorageTypeMap.get(t);
}
}
private final Dispatcher dispatcher;
private final StorageMap storages;
private final List<Path> targetPaths;
private final BlockStoragePolicy.Suite blockStoragePolicies;
Mover(NameNodeConnector nnc, Configuration conf) {
final long movedWinWidth = conf.getLong(
DFSConfigKeys.DFS_MOVER_MOVEDWINWIDTH_KEY,
DFSConfigKeys.DFS_MOVER_MOVEDWINWIDTH_DEFAULT);
final int moverThreads = conf.getInt(
DFSConfigKeys.DFS_MOVER_MOVERTHREADS_KEY,
DFSConfigKeys.DFS_MOVER_MOVERTHREADS_DEFAULT);
final int maxConcurrentMovesPerNode = conf.getInt(
DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_KEY,
DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_DEFAULT);
this.dispatcher = new Dispatcher(nnc, Collections.<String> emptySet(),
Collections.<String> emptySet(), movedWinWidth, moverThreads, 0,
maxConcurrentMovesPerNode, conf);
this.storages = new StorageMap();
this.blockStoragePolicies = BlockStoragePolicy.readBlockStorageSuite(conf);
this.targetPaths = nnc.getTargetPaths();
}
void init() throws IOException {
final List<DatanodeStorageReport> reports = dispatcher.init();
for(DatanodeStorageReport r : reports) {
final DDatanode dn = dispatcher.newDatanode(r.getDatanodeInfo());
for(StorageType t : StorageType.asList()) {
final long maxRemaining = getMaxRemaining(r, t);
if (maxRemaining > 0L) {
final Source source = dn.addSource(t, Long.MAX_VALUE, dispatcher);
final StorageGroup target = dn.addTarget(t, maxRemaining);
storages.add(source, target);
}
}
}
}
private ExitStatus run() {
try {
init();
boolean hasRemaining = new Processor().processNamespace();
return hasRemaining ? ExitStatus.IN_PROGRESS : ExitStatus.SUCCESS;
} catch (IllegalArgumentException e) {
System.out.println(e + ". Exiting ...");
return ExitStatus.ILLEGAL_ARGUMENTS;
} catch (IOException e) {
System.out.println(e + ". Exiting ...");
return ExitStatus.IO_EXCEPTION;
} finally {
dispatcher.shutdownNow();
}
}
DBlock newDBlock(Block block, List<MLocation> locations) {
final DBlock db = new DBlock(block);
for(MLocation ml : locations) {
db.addLocation(storages.getTarget(ml));
}
return db;
}
private static long getMaxRemaining(DatanodeStorageReport report, StorageType t) {
long max = 0L;
for(StorageReport r : report.getStorageReports()) {
if (r.getStorage().getStorageType() == t) {
if (r.getRemaining() > max) {
max = r.getRemaining();
}
}
}
return max;
}
/**
* Convert a snapshot path to its non-snapshot equivalent. E.g.,
* /foo/.snapshot/snapshot-name/bar --> /foo/bar
*/
private static String convertSnapshotPath(String[] pathComponents) {
StringBuilder sb = new StringBuilder(Path.SEPARATOR);
for (int i = 0; i < pathComponents.length; i++) {
if (pathComponents[i].equals(HdfsConstants.DOT_SNAPSHOT_DIR)) {
i++;
} else {
sb.append(pathComponents[i]);
}
}
return sb.toString();
}
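For reference, a self-contained version of the conversion above, showing the intended mapping from a snapshot path to its live-tree equivalent (splitting on '/' here is a simplification of INode.getPathNames):

public class SnapshotPath {
  /** /foo/.snapshot/s1/bar -> /foo/bar : drop ".snapshot" and the snapshot name. */
  static String toCurrentPath(String snapshotPath) {
    String[] parts = snapshotPath.split("/");
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < parts.length; i++) {
      if (parts[i].equals(".snapshot")) {
        i++;                       // also skip the snapshot name that follows
      } else if (!parts[i].isEmpty()) {
        sb.append('/').append(parts[i]);
      }
    }
    return sb.length() == 0 ? "/" : sb.toString();
  }

  public static void main(String[] args) {
    System.out.println(toCurrentPath("/foo/.snapshot/s1/bar"));   // /foo/bar
  }
}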
class Processor {
private final DFSClient dfs;
private final List<String> snapshottableDirs = new ArrayList<String>();
Processor() {
dfs = dispatcher.getDistributedFileSystem().getClient();
}
private void getSnapshottableDirs() {
SnapshottableDirectoryStatus[] dirs = null;
try {
dirs = dfs.getSnapshottableDirListing();
} catch (IOException e) {
LOG.warn("Failed to get snapshottable directories."
+ " Ignore and continue.", e);
}
if (dirs != null) {
for (SnapshottableDirectoryStatus dir : dirs) {
snapshottableDirs.add(dir.getFullPath().toString());
}
}
}
/**
* @return true if the given path is a snapshot path and the corresponding
* INode is still in the current fsdirectory.
*/
private boolean isSnapshotPathInCurrent(String path) throws IOException {
// if the path contains "/.snapshot/", it is a snapshot path
if (path.contains(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR_SEPARATOR)) {
String[] pathComponents = INode.getPathNames(path);
if (HdfsConstants.DOT_SNAPSHOT_DIR
.equals(pathComponents[pathComponents.length - 2])) {
// this is a path for a specific snapshot (e.g., /foo/.snapshot/s1)
return false;
}
String nonSnapshotPath = convertSnapshotPath(pathComponents);
return dfs.getFileInfo(nonSnapshotPath) != null;
} else {
return false;
}
}
/**
* @return whether there is still remaining migration work for the next
* round
*/
private boolean processNamespace() {
getSnapshottableDirs();
boolean hasRemaining = true;
try {
for (Path target : targetPaths) {
hasRemaining = processDirRecursively("", dfs.getFileInfo(target
.toUri().getPath()));
}
} catch (IOException e) {
LOG.warn("Failed to get root directory status. Ignore and continue.", e);
}
// wait for pending move to finish and retry the failed migration
hasRemaining |= Dispatcher.waitForMoveCompletion(storages.targets.values());
return hasRemaining;
}
/**
* @return whether there is still remaining migration work for the next
* round
*/
private boolean processChildrenList(String fullPath) {
boolean hasRemaining = false;
for (byte[] lastReturnedName = HdfsFileStatus.EMPTY_NAME;;) {
final DirectoryListing children;
try {
children = dfs.listPaths(fullPath, lastReturnedName, true);
} catch(IOException e) {
LOG.warn("Failed to list directory " + fullPath
+ ". Ignore the directory and continue.", e);
return hasRemaining;
}
if (children == null) {
return hasRemaining;
}
for (HdfsFileStatus child : children.getPartialListing()) {
hasRemaining |= processDirRecursively(fullPath, child);
}
if (children.hasMore()) {
lastReturnedName = children.getLastName();
} else {
return hasRemaining;
}
}
}
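processChildrenList above is a standard pagination loop: fetch a page of children starting after the last name returned, process it, and stop once the listing reports no more entries. A generic sketch of the same loop shape, with an in-memory list standing in for the RPC:

import java.util.Arrays;
import java.util.List;

public class PagedListing {
  /** One page of results plus a flag telling the caller whether to keep going. */
  static class Page {
    final List<String> entries;
    final boolean hasMore;
    Page(List<String> entries, boolean hasMore) { this.entries = entries; this.hasMore = hasMore; }
  }

  // stand-in for dfs.listPaths(dir, startAfter): return at most 2 names after the cursor
  static Page listAfter(List<String> all, String startAfter) {
    int from = startAfter.isEmpty() ? 0 : all.indexOf(startAfter) + 1;
    int to = Math.min(from + 2, all.size());
    return new Page(all.subList(from, to), to < all.size());
  }

  public static void main(String[] args) {
    List<String> dir = Arrays.asList("a", "b", "c", "d", "e");
    String cursor = "";
    while (true) {
      Page page = listAfter(dir, cursor);
      page.entries.forEach(System.out::println);             // process each child
      if (!page.hasMore) {
        break;
      }
      cursor = page.entries.get(page.entries.size() - 1);    // last returned name
    }
  }
}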
/** @return whether the migration requires next round */
private boolean processDirRecursively(String parent,
HdfsFileStatus status) {
String fullPath = status.getFullName(parent);
boolean hasRemaining = false;
if (status.isDir()) {
if (!fullPath.endsWith(Path.SEPARATOR)) {
fullPath = fullPath + Path.SEPARATOR;
}
hasRemaining = processChildrenList(fullPath);
// process snapshots if this is a snapshottable directory
if (snapshottableDirs.contains(fullPath)) {
final String dirSnapshot = fullPath + HdfsConstants.DOT_SNAPSHOT_DIR;
hasRemaining |= processChildrenList(dirSnapshot);
}
} else if (!status.isSymlink()) { // file
try {
if (!isSnapshotPathInCurrent(fullPath)) {
// skip the file if it is a snapshot path whose inode is also present in
// the current directory tree; that copy will be processed there instead.
hasRemaining = processFile((HdfsLocatedFileStatus)status);
}
} catch (IOException e) {
LOG.warn("Failed to check the status of " + parent
+ ". Ignore it and continue.", e);
return false;
}
}
return hasRemaining;
}
/** @return true if it is necessary to run another round of migration */
private boolean processFile(HdfsLocatedFileStatus status) {
final BlockStoragePolicy policy = blockStoragePolicies.getPolicy(
status.getStoragePolicy());
final List<StorageType> types = policy.chooseStorageTypes(
status.getReplication());
final LocatedBlocks locatedBlocks = status.getBlockLocations();
boolean hasRemaining = false;
final boolean lastBlkComplete = locatedBlocks.isLastBlockComplete();
List<LocatedBlock> lbs = locatedBlocks.getLocatedBlocks();
for(int i = 0; i < lbs.size(); i++) {
if (i == lbs.size() - 1 && !lastBlkComplete) {
// last block is incomplete, skip it
continue;
}
LocatedBlock lb = lbs.get(i);
final StorageTypeDiff diff = new StorageTypeDiff(types,
lb.getStorageTypes());
if (!diff.removeOverlap()) {
if (scheduleMoves4Block(diff, lb)) {
hasRemaining |= (diff.existing.size() > 1 &&
diff.expected.size() > 1);
}
}
}
return hasRemaining;
}
boolean scheduleMoves4Block(StorageTypeDiff diff, LocatedBlock lb) {
final List<MLocation> locations = MLocation.toLocations(lb);
Collections.shuffle(locations);
final DBlock db = newDBlock(lb.getBlock().getLocalBlock(), locations);
for (final StorageType t : diff.existing) {
for (final MLocation ml : locations) {
final Source source = storages.getSource(ml);
if (ml.storageType == t) {
// try to schedule one replica move.
if (scheduleMoveReplica(db, source, diff.expected)) {
return true;
}
}
}
}
return false;
}
@VisibleForTesting
boolean scheduleMoveReplica(DBlock db, MLocation ml,
List<StorageType> targetTypes) {
return scheduleMoveReplica(db, storages.getSource(ml), targetTypes);
}
boolean scheduleMoveReplica(DBlock db, Source source,
List<StorageType> targetTypes) {
if (dispatcher.getCluster().isNodeGroupAware()) {
if (chooseTarget(db, source, targetTypes, Matcher.SAME_NODE_GROUP)) {
return true;
}
}
// Then, match nodes on the same rack
if (chooseTarget(db, source, targetTypes, Matcher.SAME_RACK)) {
return true;
}
// At last, match all remaining nodes
return chooseTarget(db, source, targetTypes, Matcher.ANY_OTHER);
}
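scheduleMoveReplica above is a plain fallback chain: try the most local placement first (same node group, if the topology is node-group aware), then the same rack, then anywhere else, stopping at the first success. A compact sketch of that shape (the predicates are illustrative stand-ins for the Matcher checks):

import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;

public class FallbackChain {
  /** Return true as soon as one scope succeeds; scopes are tried in order. */
  static boolean tryInOrder(List<Predicate<String>> scopes, String block) {
    for (Predicate<String> scope : scopes) {
      if (scope.test(block)) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    List<Predicate<String>> scopes = Arrays.asList(
        b -> false,              // same node group: no capacity
        b -> true,               // same rack: succeeds
        b -> true);              // any other: never reached
    System.out.println(tryInOrder(scopes, "blk_1"));   // true
  }
}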
boolean chooseTarget(DBlock db, Source source,
List<StorageType> targetTypes, Matcher matcher) {
final NetworkTopology cluster = dispatcher.getCluster();
for (StorageType t : targetTypes) {
for(StorageGroup target : storages.getTargetStorages(t)) {
if (matcher.match(cluster, source.getDatanodeInfo(),
target.getDatanodeInfo())) {
final PendingMove pm = source.addPendingMove(db, target);
if (pm != null) {
dispatcher.executePendingMove(pm);
return true;
}
}
}
}
return false;
}
}
static class MLocation {
final DatanodeInfo datanode;
final StorageType storageType;
final long size;
MLocation(DatanodeInfo datanode, StorageType storageType, long size) {
this.datanode = datanode;
this.storageType = storageType;
this.size = size;
}
static List<MLocation> toLocations(LocatedBlock lb) {
final DatanodeInfo[] datanodeInfos = lb.getLocations();
final StorageType[] storageTypes = lb.getStorageTypes();
final long size = lb.getBlockSize();
final List<MLocation> locations = new LinkedList<MLocation>();
for(int i = 0; i < datanodeInfos.length; i++) {
locations.add(new MLocation(datanodeInfos[i], storageTypes[i], size));
}
return locations;
}
}
@VisibleForTesting
static class StorageTypeDiff {
final List<StorageType> expected;
final List<StorageType> existing;
StorageTypeDiff(List<StorageType> expected, StorageType[] existing) {
this.expected = new LinkedList<StorageType>(expected);
this.existing = new LinkedList<StorageType>(Arrays.asList(existing));
}
/**
* Remove the overlap between the expected types and the existing types.
* @return true if either the existing types or the expected types list is
* empty after removing the overlap.
*/
boolean removeOverlap() {
for(Iterator<StorageType> i = existing.iterator(); i.hasNext(); ) {
final StorageType t = i.next();
if (expected.remove(t)) {
i.remove();
}
}
return expected.isEmpty() || existing.isEmpty();
}
@Override
public String toString() {
return getClass().getSimpleName() + "{expected=" + expected
+ ", existing=" + existing + "}";
}
}
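A worked example of removeOverlap above, with a simplified enum standing in for StorageType: expected [SSD, DISK] against existing [DISK, DISK] leaves expected [SSD] and existing [DISK], so the block still needs one replica moved from DISK to SSD.

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

public class TypeDiffDemo {
  enum T { DISK, SSD, ARCHIVE }

  public static void main(String[] args) {
    List<T> expected = new LinkedList<>(Arrays.asList(T.SSD, T.DISK));
    List<T> existing = new LinkedList<>(Arrays.asList(T.DISK, T.DISK));

    // cancel out each existing replica type against one expected slot
    for (Iterator<T> i = existing.iterator(); i.hasNext(); ) {
      if (expected.remove(i.next())) {
        i.remove();
      }
    }

    System.out.println(expected);   // [SSD]  -> still want one SSD replica
    System.out.println(existing);   // [DISK] -> one DISK replica is surplus
    System.out.println(expected.isEmpty() || existing.isEmpty());   // false -> schedule a move
  }
}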
static int run(Map<URI, List<Path>> namenodes, Configuration conf)
throws IOException, InterruptedException {
final long sleeptime =
conf.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 2000 +
conf.getLong(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_DEFAULT) * 1000;
LOG.info("namenodes = " + namenodes);
List<NameNodeConnector> connectors = Collections.emptyList();
try {
connectors = NameNodeConnector.newNameNodeConnectors(namenodes,
Mover.class.getSimpleName(), MOVER_ID_PATH, conf);
while (connectors.size() > 0) {
Collections.shuffle(connectors);
Iterator<NameNodeConnector> iter = connectors.iterator();
while (iter.hasNext()) {
NameNodeConnector nnc = iter.next();
final Mover m = new Mover(nnc, conf);
final ExitStatus r = m.run();
if (r == ExitStatus.SUCCESS) {
IOUtils.cleanup(LOG, nnc);
iter.remove();
} else if (r != ExitStatus.IN_PROGRESS) {
// must be an error status; return
return r.getExitCode();
}
}
Thread.sleep(sleeptime);
}
return ExitStatus.SUCCESS.getExitCode();
} finally {
for (NameNodeConnector nnc : connectors) {
IOUtils.cleanup(LOG, nnc);
}
}
}
static class Cli extends Configured implements Tool {
private static final String USAGE = "Usage: java "
+ Mover.class.getSimpleName() + " [-p <files/dirs> | -f <local file>]"
+ "\n\t-p <files/dirs>\ta space separated list of HDFS files/dirs to migrate."
+ "\n\t-f <local file>\ta local file containing a list of HDFS files/dirs to migrate.";
private static Options buildCliOptions() {
Options opts = new Options();
Option file = OptionBuilder.withArgName("pathsFile").hasArg()
.withDescription("a local file containing files/dirs to migrate")
.create("f");
Option paths = OptionBuilder.withArgName("paths").hasArgs()
.withDescription("specify space separated files/dirs to migrate")
.create("p");
OptionGroup group = new OptionGroup();
group.addOption(file);
group.addOption(paths);
opts.addOptionGroup(group);
return opts;
}
private static String[] readPathFile(String file) throws IOException {
List<String> list = Lists.newArrayList();
BufferedReader reader = new BufferedReader(new FileReader(file));
try {
String line;
while ((line = reader.readLine()) != null) {
if (!line.trim().isEmpty()) {
list.add(line);
}
}
} finally {
IOUtils.cleanup(LOG, reader);
}
return list.toArray(new String[list.size()]);
}
private static Map<URI, List<Path>> getNameNodePaths(CommandLine line,
Configuration conf) throws Exception {
Map<URI, List<Path>> map = Maps.newHashMap();
String[] paths = null;
if (line.hasOption("f")) {
paths = readPathFile(line.getOptionValue("f"));
} else if (line.hasOption("p")) {
paths = line.getOptionValues("p");
}
Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
if (paths == null || paths.length == 0) {
for (URI namenode : namenodes) {
map.put(namenode, null);
}
return map;
}
final URI singleNs = namenodes.size() == 1 ?
namenodes.iterator().next() : null;
for (String path : paths) {
Path target = new Path(path);
if (!target.isUriPathAbsolute()) {
throw new IllegalArgumentException("The path " + target
+ " is not absolute");
}
URI targetUri = target.toUri();
if ((targetUri.getAuthority() == null || targetUri.getScheme() ==
null) && singleNs == null) {
// each path must contain both scheme and authority information
// unless there is only one name service specified in the
// configuration
throw new IllegalArgumentException("The path " + target
+ " does not contain scheme and authority thus cannot identify"
+ " its name service");
}
URI key = singleNs;
if (singleNs == null) {
key = new URI(targetUri.getScheme(), targetUri.getAuthority(),
null, null, null);
if (!namenodes.contains(key)) {
throw new IllegalArgumentException("Cannot resolve the path " +
target + ". The namenode services specified in the " +
"configuration: " + namenodes);
}
}
List<Path> targets = map.get(key);
if (targets == null) {
targets = Lists.newArrayList();
map.put(key, targets);
}
targets.add(Path.getPathWithoutSchemeAndAuthority(target));
}
return map;
}
@VisibleForTesting
static Map<URI, List<Path>> getNameNodePathsToMove(Configuration conf,
String... args) throws Exception {
final Options opts = buildCliOptions();
CommandLineParser parser = new GnuParser();
CommandLine commandLine = parser.parse(opts, args, true);
return getNameNodePaths(commandLine, conf);
}
@Override
public int run(String[] args) throws Exception {
final long startTime = Time.monotonicNow();
final Configuration conf = getConf();
try {
final Map<URI, List<Path>> map = getNameNodePathsToMove(conf, args);
return Mover.run(map, conf);
} catch (IOException e) {
System.out.println(e + ". Exiting ...");
return ExitStatus.IO_EXCEPTION.getExitCode();
} catch (InterruptedException e) {
System.out.println(e + ". Exiting ...");
return ExitStatus.INTERRUPTED.getExitCode();
} catch (ParseException e) {
System.out.println(e + ". Exiting ...");
return ExitStatus.ILLEGAL_ARGUMENTS.getExitCode();
} catch (IllegalArgumentException e) {
System.out.println(e + ". Exiting ...");
return ExitStatus.ILLEGAL_ARGUMENTS.getExitCode();
} finally {
System.out.format("%-24s ", DateFormat.getDateTimeInstance().format(new Date()));
System.out.println("Mover took " + StringUtils.formatTime(Time.monotonicNow()-startTime));
}
}
}
/**
* Run the Mover from the command line.
*
* @param args Command line arguments
*/
public static void main(String[] args) {
if (DFSUtil.parseHelpArgument(args, Cli.USAGE, System.out, true)) {
System.exit(0);
}
try {
System.exit(ToolRunner.run(new HdfsConfiguration(), new Cli(), args));
} catch (Throwable e) {
LOG.error("Exiting " + Mover.class.getSimpleName()
+ " due to an exception", e);
System.exit(-1);
}
}
}
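The main() above simply hands the Cli to ToolRunner, so the same entry point can also be driven programmatically. A hedged sketch (the paths are illustrative; the example class has to live in the mover package because Cli is package-private):

package org.apache.hadoop.hdfs.server.mover;   // Cli is package-private, so stay in its package

import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.util.ToolRunner;

public class RunMoverExample {
  public static void main(String[] args) throws Exception {
    // migrate two illustrative directories; -p is the option declared in Cli above
    int exitCode = ToolRunner.run(new HdfsConfiguration(),
        new Mover.Cli(), new String[] {"-p", "/archive", "/cold"});
    System.out.println("mover exit code: " + exitCode);
  }
}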

View File

@ -52,6 +52,7 @@ import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
@ -273,6 +274,14 @@ public class FSDirectory implements Closeable {
skipQuotaCheck = true;
}
private static INodeFile newINodeFile(long id, PermissionStatus permissions,
long mtime, long atime, short replication, long preferredBlockSize,
boolean isLazyPersist) {
return new INodeFile(id, null, permissions, mtime, atime,
BlockInfo.EMPTY_ARRAY, replication, preferredBlockSize,
isLazyPersist, (byte) 0);
}
/**
* Add the given filename to the fs.
* @throws FileAlreadyExistsException
@ -288,9 +297,9 @@ public class FSDirectory implements Closeable {
UnresolvedLinkException, SnapshotAccessControlException, AclException {
long modTime = now();
INodeFile newNode = new INodeFile(namesystem.allocateNewInodeId(), null,
permissions, modTime, modTime, BlockInfo.EMPTY_ARRAY, replication,
preferredBlockSize, isLazyPersist);
INodeFile newNode = newINodeFile(namesystem.allocateNewInodeId(),
permissions, modTime, modTime, replication, preferredBlockSize,
isLazyPersist);
newNode.toUnderConstruction(clientName, clientMachine);
boolean added = false;
@ -327,14 +336,13 @@ public class FSDirectory implements Closeable {
final INodeFile newNode;
assert hasWriteLock();
if (underConstruction) {
newNode = new INodeFile(id, null, permissions, modificationTime,
modificationTime, BlockInfo.EMPTY_ARRAY, replication,
preferredBlockSize, isLazyPersist);
newNode = newINodeFile(id, permissions, modificationTime,
modificationTime, replication, preferredBlockSize, isLazyPersist);
newNode.toUnderConstruction(clientName, clientMachine);
} else {
newNode = new INodeFile(id, null, permissions, modificationTime, atime,
BlockInfo.EMPTY_ARRAY, replication, preferredBlockSize, isLazyPersist);
newNode = newINodeFile(id, permissions, modificationTime, atime,
replication, preferredBlockSize, isLazyPersist);
}
try {
@ -1001,6 +1009,44 @@ public class FSDirectory implements Closeable {
return file.getBlocks();
}
/** Set block storage policy for a directory */
void setStoragePolicy(String src, byte policyId)
throws IOException {
writeLock();
try {
unprotectedSetStoragePolicy(src, policyId);
} finally {
writeUnlock();
}
}
void unprotectedSetStoragePolicy(String src, byte policyId)
throws IOException {
assert hasWriteLock();
final INodesInPath iip = getINodesInPath4Write(src, true);
final INode inode = iip.getLastINode();
if (inode == null) {
throw new FileNotFoundException("File/Directory does not exist: " + src);
}
final int snapshotId = iip.getLatestSnapshotId();
if (inode.isFile()) {
inode.asFile().setStoragePolicyID(policyId, snapshotId);
} else if (inode.isDirectory()) {
setDirStoragePolicy(inode.asDirectory(), policyId, snapshotId);
} else {
throw new FileNotFoundException(src + " is not a file or directory");
}
}
private void setDirStoragePolicy(INodeDirectory inode, byte policyId,
int latestSnapshotId) throws IOException {
List<XAttr> existingXAttrs = XAttrStorage.readINodeXAttrs(inode);
XAttr xAttr = BlockStoragePolicy.buildXAttr(policyId);
List<XAttr> newXAttrs = setINodeXAttrs(existingXAttrs, Arrays.asList(xAttr),
EnumSet.of(XAttrSetFlag.CREATE, XAttrSetFlag.REPLACE));
XAttrStorage.updateINodeXAttrs(inode, newXAttrs, latestSnapshotId);
}
/**
* @param path the file path
* @return the block size of the file.
@ -1332,6 +1378,11 @@ public class FSDirectory implements Closeable {
}
}
private byte getStoragePolicyID(byte inodePolicy, byte parentPolicy) {
return inodePolicy != BlockStoragePolicy.ID_UNSPECIFIED ? inodePolicy :
parentPolicy;
}
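getStoragePolicyID above encodes the inheritance rule for storage policies: an inode's own policy wins, otherwise the nearest ancestor's applies. A small standalone sketch of that fallback, walking a path from leaf to root (the ID_UNSPECIFIED sentinel value of 0 and the policy id 5 are illustrative assumptions):

import java.util.Arrays;
import java.util.List;

public class PolicyInheritance {
  static final byte ID_UNSPECIFIED = 0;   // assumed sentinel, mirroring the check above

  /** Walk from the inode up to the root; the first explicit policy wins. */
  static byte effectivePolicy(List<Byte> pathPolicies) {
    for (int i = pathPolicies.size() - 1; i >= 0; i--) {
      byte p = pathPolicies.get(i);
      if (p != ID_UNSPECIFIED) {
        return p;
      }
    }
    return ID_UNSPECIFIED;
  }

  public static void main(String[] args) {
    // policies along /warm/dir/file: root unspecified, dir has policy 5, file unspecified
    List<Byte> path = Arrays.asList((byte) 0, (byte) 5, (byte) 0);
    System.out.println(effectivePolicy(path));   // 5
  }
}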
/**
* Get a partial listing of the indicated directory
*
@ -1346,7 +1397,8 @@ public class FSDirectory implements Closeable {
* @return a partial listing starting after startAfter
*/
DirectoryListing getListing(String src, byte[] startAfter,
boolean needLocation) throws UnresolvedLinkException, IOException {
boolean needLocation, boolean isSuperUser)
throws UnresolvedLinkException, IOException {
String srcs = normalizePath(src);
final boolean isRawPath = isReservedRawName(src);
@ -1355,16 +1407,19 @@ public class FSDirectory implements Closeable {
if (srcs.endsWith(HdfsConstants.SEPARATOR_DOT_SNAPSHOT_DIR)) {
return getSnapshotsListing(srcs, startAfter);
}
final INodesInPath inodesInPath = getLastINodeInPath(srcs, true);
final INodesInPath inodesInPath = getLastINodeInPath(srcs);
final int snapshot = inodesInPath.getPathSnapshotId();
final INode targetNode = inodesInPath.getINode(0);
final INode targetNode = inodesInPath.getLastINode();
if (targetNode == null)
return null;
byte parentStoragePolicy = isSuperUser ?
targetNode.getStoragePolicyID() : BlockStoragePolicy.ID_UNSPECIFIED;
if (!targetNode.isDirectory()) {
return new DirectoryListing(
new HdfsFileStatus[]{createFileStatus(HdfsFileStatus.EMPTY_NAME,
targetNode, needLocation, snapshot, isRawPath)}, 0);
targetNode, needLocation, parentStoragePolicy, snapshot,
isRawPath)}, 0);
}
final INodeDirectory dirInode = targetNode.asDirectory();
@ -1377,8 +1432,11 @@ public class FSDirectory implements Closeable {
HdfsFileStatus listing[] = new HdfsFileStatus[numOfListing];
for (int i=0; i<numOfListing && locationBudget>0; i++) {
INode cur = contents.get(startChild+i);
listing[i] = createFileStatus(cur.getLocalNameBytes(), cur,
needLocation, snapshot, isRawPath);
byte curPolicy = isSuperUser && !cur.isSymlink()?
cur.getLocalStoragePolicyID(): BlockStoragePolicy.ID_UNSPECIFIED;
listing[i] = createFileStatus(cur.getLocalNameBytes(), cur, needLocation,
getStoragePolicyID(curPolicy, parentStoragePolicy), snapshot,
isRawPath);
listingCnt++;
if (needLocation) {
// Once we hit lsLimit locations, stop.
@ -1429,7 +1487,7 @@ public class FSDirectory implements Closeable {
for (int i = 0; i < numOfListing; i++) {
Root sRoot = snapshots.get(i + skipSize).getRoot();
listing[i] = createFileStatus(sRoot.getLocalNameBytes(), sRoot,
Snapshot.CURRENT_STATE_ID, false);
BlockStoragePolicy.ID_UNSPECIFIED, Snapshot.CURRENT_STATE_ID, false);
}
return new DirectoryListing(
listing, snapshots.size() - skipSize - numOfListing);
@ -1439,10 +1497,12 @@ public class FSDirectory implements Closeable {
* @param src The string representation of the path to the file
* @param resolveLink whether to throw UnresolvedLinkException
* @param isRawPath true if a /.reserved/raw pathname was passed by the user
* @param includeStoragePolicy whether to include storage policy
* @return object containing information regarding the file
* or null if file not found
*/
HdfsFileStatus getFileInfo(String src, boolean resolveLink, boolean isRawPath)
HdfsFileStatus getFileInfo(String src, boolean resolveLink,
boolean isRawPath, boolean includeStoragePolicy)
throws IOException {
String srcs = normalizePath(src);
readLock();
@ -1452,9 +1512,10 @@ public class FSDirectory implements Closeable {
}
final INodesInPath inodesInPath = getLastINodeInPath(srcs, resolveLink);
final INode i = inodesInPath.getINode(0);
return i == null? null: createFileStatus(HdfsFileStatus.EMPTY_NAME, i,
inodesInPath.getPathSnapshotId(), isRawPath);
byte policyId = includeStoragePolicy && i != null && !i.isSymlink() ?
i.getStoragePolicyID() : BlockStoragePolicy.ID_UNSPECIFIED;
return i == null ? null : createFileStatus(HdfsFileStatus.EMPTY_NAME, i,
policyId, inodesInPath.getPathSnapshotId(), isRawPath);
} finally {
readUnlock();
}
@ -1471,7 +1532,8 @@ public class FSDirectory implements Closeable {
throws UnresolvedLinkException {
if (getINode4DotSnapshot(src) != null) {
return new HdfsFileStatus(0, true, 0, 0, false, 0, 0, null, null, null, null,
HdfsFileStatus.EMPTY_NAME, -1L, 0, null);
HdfsFileStatus.EMPTY_NAME, -1L, 0, null,
BlockStoragePolicy.ID_UNSPECIFIED);
}
return null;
}
@ -2299,19 +2361,20 @@ public class FSDirectory implements Closeable {
* @throws IOException if any error occurs
*/
private HdfsFileStatus createFileStatus(byte[] path, INode node,
boolean needLocation, int snapshot, boolean isRawPath)
boolean needLocation, byte storagePolicy, int snapshot, boolean isRawPath)
throws IOException {
if (needLocation) {
return createLocatedFileStatus(path, node, snapshot, isRawPath);
return createLocatedFileStatus(path, node, storagePolicy, snapshot, isRawPath);
} else {
return createFileStatus(path, node, snapshot, isRawPath);
return createFileStatus(path, node, storagePolicy, snapshot, isRawPath);
}
}
/**
* Create FileStatus by file INode
*/
HdfsFileStatus createFileStatus(byte[] path, INode node,
int snapshot, boolean isRawPath) throws IOException {
HdfsFileStatus createFileStatus(byte[] path, INode node, byte storagePolicy,
int snapshot, boolean isRawPath) throws IOException {
long size = 0; // length is zero for directories
short replication = 0;
long blocksize = 0;
@ -2351,14 +2414,15 @@ public class FSDirectory implements Closeable {
path,
node.getId(),
childrenNum,
feInfo);
feInfo,
storagePolicy);
}
/**
* Create FileStatus with location info by file INode
*/
private HdfsLocatedFileStatus createLocatedFileStatus(byte[] path,
INode node, int snapshot, boolean isRawPath) throws IOException {
private HdfsLocatedFileStatus createLocatedFileStatus(byte[] path, INode node,
byte storagePolicy, int snapshot, boolean isRawPath) throws IOException {
assert hasReadLock();
long size = 0; // length is zero for directories
short replication = 0;
@ -2400,7 +2464,7 @@ public class FSDirectory implements Closeable {
getPermissionForFileStatus(node, snapshot, isEncrypted),
node.getUserName(snapshot), node.getGroupName(snapshot),
node.isSymlink() ? node.asSymlink().getSymlink() : null, path,
node.getId(), loc, childrenNum, feInfo);
node.getId(), loc, childrenNum, feInfo, storagePolicy);
// Set caching information for the located blocks.
if (loc != null) {
CacheManager cacheManager = namesystem.getCacheManager();

View File

@ -82,6 +82,7 @@ import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetOwnerOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetPermissionsOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetStoragePolicyOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetXAttrOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp;
@ -831,7 +832,16 @@ public class FSEditLog implements LogsPurgeable {
.setReplication(replication);
logEdit(op);
}
/**
* Add set storage policy id record to edit log
*/
void logSetStoragePolicy(String src, byte policyId) {
SetStoragePolicyOp op = SetStoragePolicyOp.getInstance(cache.get())
.setPath(src).setPolicyId(policyId);
logEdit(op);
}
/** Add set namespace quota record to edit log
*
* @param src the string representation of the path to a directory

View File

@ -33,6 +33,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.XAttrSetFlag;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
@ -78,6 +79,7 @@ import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetOwnerOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetPermissionsOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetStoragePolicyOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetXAttrOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RemoveXAttrOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp;
@ -369,7 +371,8 @@ public class FSEditLogLoader {
// add the op into retry cache if necessary
if (toAddRetryCache) {
HdfsFileStatus stat = fsNamesys.dir.createFileStatus(
HdfsFileStatus.EMPTY_NAME, newFile, Snapshot.CURRENT_STATE_ID,
HdfsFileStatus.EMPTY_NAME, newFile,
BlockStoragePolicy.ID_UNSPECIFIED, Snapshot.CURRENT_STATE_ID,
false);
fsNamesys.addCacheEntryWithPayload(addCloseOp.rpcClientId,
addCloseOp.rpcCallId, stat);
@ -833,6 +836,13 @@ public class FSEditLogLoader {
}
break;
}
case OP_SET_STORAGE_POLICY: {
SetStoragePolicyOp setStoragePolicyOp = (SetStoragePolicyOp) op;
fsDir.unprotectedSetStoragePolicy(
renameReservedPathsOnUpgrade(setStoragePolicyOp.path, logVersion),
setStoragePolicyOp.policyId);
break;
}
default:
throw new IOException("Invalid operation read " + op.opCode);
}

View File

@ -61,6 +61,7 @@ import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.OP_SYMLINK
import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.OP_TIMES;
import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.OP_UPDATE_BLOCKS;
import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.OP_UPDATE_MASTER_KEY;
import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.OP_SET_STORAGE_POLICY;
import java.io.DataInput;
import java.io.DataInputStream;
@ -193,6 +194,7 @@ public abstract class FSEditLogOp {
OP_ROLLING_UPGRADE_FINALIZE, "finalize"));
inst.put(OP_SET_XATTR, new SetXAttrOp());
inst.put(OP_REMOVE_XATTR, new RemoveXAttrOp());
inst.put(OP_SET_STORAGE_POLICY, new SetStoragePolicyOp());
}
public FSEditLogOp get(FSEditLogOpCodes opcode) {
@ -3821,6 +3823,71 @@ public abstract class FSEditLogOp {
}
}
/** {@literal @Idempotent} for {@link ClientProtocol#setStoragePolicy} */
static class SetStoragePolicyOp extends FSEditLogOp {
String path;
byte policyId;
private SetStoragePolicyOp() {
super(OP_SET_STORAGE_POLICY);
}
static SetStoragePolicyOp getInstance(OpInstanceCache cache) {
return (SetStoragePolicyOp) cache.get(OP_SET_STORAGE_POLICY);
}
SetStoragePolicyOp setPath(String path) {
this.path = path;
return this;
}
SetStoragePolicyOp setPolicyId(byte policyId) {
this.policyId = policyId;
return this;
}
@Override
public void writeFields(DataOutputStream out) throws IOException {
FSImageSerialization.writeString(path, out);
out.writeByte(policyId);
}
@Override
void readFields(DataInputStream in, int logVersion)
throws IOException {
this.path = FSImageSerialization.readString(in);
this.policyId = in.readByte();
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("SetStoragePolicyOp [path=");
builder.append(path);
builder.append(", policyId=");
builder.append(policyId);
builder.append(", opCode=");
builder.append(opCode);
builder.append(", txid=");
builder.append(txid);
builder.append("]");
return builder.toString();
}
@Override
protected void toXml(ContentHandler contentHandler) throws SAXException {
XMLUtils.addSaxString(contentHandler, "PATH", path);
XMLUtils.addSaxString(contentHandler, "POLICYID",
Byte.valueOf(policyId).toString());
}
@Override
void fromXml(Stanza st) throws InvalidXmlException {
this.path = st.getValue("PATH");
this.policyId = Byte.valueOf(st.getValue("POLICYID"));
}
}
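As a rough illustration of the payload that writeFields() above produces — a path string followed by a single policy-id byte — the following self-contained sketch round-trips the two fields. It deliberately substitutes plain DataOutputStream.writeUTF for FSImageSerialization.writeString (which uses its own compact encoding), and the class name, path, and policy id are hypothetical values chosen only for the example.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class SetStoragePolicyPayloadSketch {
  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (DataOutputStream out = new DataOutputStream(bytes)) {
      out.writeUTF("/archive/2014"); // path (hypothetical); the real op uses FSImageSerialization.writeString
      out.writeByte(4);              // policy id, e.g. COLD:4
    }
    try (DataInputStream in =
        new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      System.out.println(in.readUTF() + " -> policy " + in.readByte());
    }
  }
}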
/**
* Class for writing editlog ops
*/

View File

@ -72,6 +72,7 @@ public enum FSEditLogOpCodes {
OP_ROLLING_UPGRADE_FINALIZE ((byte) 42),
OP_SET_XATTR ((byte) 43),
OP_REMOVE_XATTR ((byte) 44),
OP_SET_STORAGE_POLICY ((byte) 45),
// Note that the current range of the valid OP code is 0~127
OP_INVALID ((byte) -1);

View File

@ -890,7 +890,7 @@ public class FSImageFormat {
// LazyPersist flag will not be present in old image formats and hence
// can be safely set to false always.
return new INodeFileAttributes.SnapshotCopy(name, permissions, null, modificationTime,
accessTime, replication, preferredBlockSize, false, null);
accessTime, replication, preferredBlockSize, false, (byte) 0, null);
}
public INodeDirectoryAttributes loadINodeDirectoryAttributes(DataInput in)

View File

@ -291,7 +291,8 @@ public final class FSImageFormatPBINode {
final INodeFile file = new INodeFile(n.getId(),
n.getName().toByteArray(), permissions, f.getModificationTime(),
f.getAccessTime(), blocks, replication, f.getPreferredBlockSize(),
f.hasIsLazyPersist() ? f.getIsLazyPersist() : false);
f.hasIsLazyPersist() ? f.getIsLazyPersist() : false,
(byte)f.getStoragePolicyID());
if (f.hasAcl()) {
file.addAclFeature(new AclFeature(loadAclEntries(f.getAcl(),
@ -403,7 +404,8 @@ public final class FSImageFormatPBINode {
.setPermission(buildPermissionStatus(file, state.getStringMap()))
.setPreferredBlockSize(file.getPreferredBlockSize())
.setReplication(file.getFileReplication())
.setIsLazyPersist(file.getLazyPersistFlag());
.setIsLazyPersist(file.getLazyPersistFlag())
.setStoragePolicyID(file.getLocalStoragePolicyID());
AclFeature f = file.getAclFeature();
if (f != null) {

View File

@ -67,8 +67,6 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LAZY_PERSIST_FIL
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY;
@ -164,6 +162,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil;
@ -338,7 +337,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private HdfsFileStatus getAuditFileInfo(String path, boolean resolveSymlink)
throws IOException {
return (isAuditEnabled() && isExternalInvocation())
? dir.getFileInfo(path, resolveSymlink, false) : null;
? dir.getFileInfo(path, resolveSymlink, false, false) : null;
}
private void logAuditEvent(boolean succeeded, String cmd, String src)
@ -559,8 +558,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private final FSImage fsImage;
private boolean randomizeBlockLocationsPerBlock;
/**
* Notify that loading of this FSDirectory is complete, and
* it is imageLoaded for use
@ -886,10 +883,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
this.randomizeBlockLocationsPerBlock = conf.getBoolean(
DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK,
DFS_NAMENODE_RANDOMIZE_BLOCK_LOCATIONS_PER_BLOCK_DEFAULT);
this.dtSecretManager = createDelegationTokenSecretManager(conf);
this.dir = new FSDirectory(this, conf);
this.snapshotManager = new SnapshotManager(dir);
@ -1770,7 +1763,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
true);
if (blocks != null) {
blockManager.getDatanodeManager().sortLocatedBlocks(clientMachine,
blocks.getLocatedBlocks(), randomizeBlockLocationsPerBlock);
blocks.getLocatedBlocks());
// lastBlock is not part of getLocatedBlocks(), might need to sort it too
LocatedBlock lastBlock = blocks.getLastLocatedBlock();
@ -1779,7 +1772,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
Lists.newArrayListWithCapacity(1);
lastBlockList.add(lastBlock);
blockManager.getDatanodeManager().sortLocatedBlocks(clientMachine,
lastBlockList, randomizeBlockLocationsPerBlock);
lastBlockList);
}
}
return blocks;
@ -2292,6 +2285,52 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
return isFile;
}
/**
* Set the storage policy for a file or a directory.
*
* @param src file/directory path
* @param policyName storage policy name
*/
void setStoragePolicy(String src, final String policyName)
throws IOException {
try {
setStoragePolicyInt(src, policyName);
} catch (AccessControlException e) {
logAuditEvent(false, "setStoragePolicy", src);
throw e;
}
}
private void setStoragePolicyInt(String src, final String policyName)
throws IOException {
checkSuperuserPrivilege();
checkOperation(OperationCategory.WRITE);
byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
waitForLoadingFSImage();
HdfsFileStatus fileStat;
writeLock();
try {
checkOperation(OperationCategory.WRITE);
checkNameNodeSafeMode("Cannot set storage policy for " + src);
src = FSDirectory.resolvePath(src, pathComponents, dir);
// get the corresponding policy and make sure the policy name is valid
BlockStoragePolicy policy = blockManager.getStoragePolicy(policyName);
if (policy == null) {
throw new HadoopIllegalArgumentException(
"Cannot find a block policy with the name " + policyName);
}
dir.setStoragePolicy(src, policy.getId());
getEditLog().logSetStoragePolicy(src, policy.getId());
fileStat = getAuditFileInfo(src, false);
} finally {
writeUnlock();
}
getEditLog().logSync();
logAuditEvent(true, "setStoragePolicy", src, null, fileStat);
}
long getPreferredBlockSize(String filename)
throws IOException, UnresolvedLinkException {
FSPermissionChecker pc = getPermissionChecker();
@ -2476,84 +2515,66 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
waitForLoadingFSImage();
/*
* We want to avoid holding any locks while doing KeyProvider operations,
* since they can be very slow. Since the path can
* flip flop between being in an encryption zone and not in the meantime,
* we need to recheck the preconditions and redo KeyProvider operations
* in some situations.
*
* A special RetryStartFileException is used to indicate that we should
* retry creation of a FileEncryptionInfo.
/**
* If the file is in an encryption zone, we optimistically create an
* EDEK for the file by calling out to the configured KeyProvider.
* Since this typically involves doing an RPC, we take the readLock
* initially, then drop it to do the RPC.
*
* Since the path can flip-flop between being in an encryption zone and not
* in the meantime, we need to recheck the preconditions when we retake the
* lock to do the create. If the preconditions are not met, we throw a
* special RetryStartFileException to ask the DFSClient to try the create
* again later.
*/
BlocksMapUpdateInfo toRemoveBlocks = null;
CipherSuite suite = null;
String ezKeyName = null;
readLock();
try {
boolean shouldContinue = true;
int iters = 0;
while (shouldContinue) {
skipSync = false;
if (iters >= 10) {
throw new IOException("Too many retries because of encryption zone " +
"operations, something might be broken!");
}
shouldContinue = false;
iters++;
// Optimistically determine CipherSuite and ezKeyName if the path is
// currently within an encryption zone
CipherSuite suite = null;
String ezKeyName = null;
readLock();
try {
src = resolvePath(src, pathComponents);
INodesInPath iip = dir.getINodesInPath4Write(src);
// Nothing to do if the path is not within an EZ
if (dir.isInAnEZ(iip)) {
suite = chooseCipherSuite(iip, cipherSuites);
if (suite != null) {
Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
"Chose an UNKNOWN CipherSuite!");
}
ezKeyName = dir.getKeyName(iip);
Preconditions.checkState(ezKeyName != null);
}
} finally {
readUnlock();
}
Preconditions.checkState(
(suite == null && ezKeyName == null) ||
(suite != null && ezKeyName != null),
"Both suite and ezKeyName should both be null or not null");
// Generate EDEK if necessary while not holding the lock
EncryptedKeyVersion edek =
generateEncryptedDataEncryptionKey(ezKeyName);
EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
// Try to create the file with the computed cipher suite and EDEK
writeLock();
try {
checkOperation(OperationCategory.WRITE);
checkNameNodeSafeMode("Cannot create file" + src);
src = resolvePath(src, pathComponents);
toRemoveBlocks = startFileInternal(pc, src, permissions, holder,
clientMachine, create, overwrite, createParent, replication,
blockSize, isLazyPersist, suite, edek, logRetryCache);
stat = dir.getFileInfo(src, false,
FSDirectory.isReservedRawName(srcArg));
} catch (StandbyException se) {
skipSync = true;
throw se;
} catch (RetryStartFileException e) {
shouldContinue = true;
if (LOG.isTraceEnabled()) {
LOG.trace("Preconditions failed, retrying creation of " +
"FileEncryptionInfo", e);
}
} finally {
writeUnlock();
src = resolvePath(src, pathComponents);
INodesInPath iip = dir.getINodesInPath4Write(src);
// Nothing to do if the path is not within an EZ
if (dir.isInAnEZ(iip)) {
suite = chooseCipherSuite(iip, cipherSuites);
if (suite != null) {
Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
"Chose an UNKNOWN CipherSuite!");
}
ezKeyName = dir.getKeyName(iip);
Preconditions.checkState(ezKeyName != null);
}
} finally {
readUnlock();
}
Preconditions.checkState(
(suite == null && ezKeyName == null) ||
(suite != null && ezKeyName != null),
"Both suite and ezKeyName should both be null or not null");
// Generate EDEK if necessary while not holding the lock
EncryptedKeyVersion edek =
generateEncryptedDataEncryptionKey(ezKeyName);
EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
// Proceed with the create, using the computed cipher suite and
// generated EDEK
BlocksMapUpdateInfo toRemoveBlocks = null;
writeLock();
try {
checkOperation(OperationCategory.WRITE);
checkNameNodeSafeMode("Cannot create file" + src);
src = resolvePath(src, pathComponents);
toRemoveBlocks = startFileInternal(pc, src, permissions, holder,
clientMachine, create, overwrite, createParent, replication,
blockSize, isLazyPersist, suite, edek, logRetryCache);
stat = dir.getFileInfo(src, false,
FSDirectory.isReservedRawName(srcArg), false);
} catch (StandbyException se) {
skipSync = true;
throw se;
} finally {
writeUnlock();
// There might be transactions logged while trying to recover the lease.
// They need to be sync'ed even when an exception was thrown.
if (!skipSync) {
@ -3020,8 +3041,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
throws LeaseExpiredException, NotReplicatedYetException,
QuotaExceededException, SafeModeException, UnresolvedLinkException,
IOException {
long blockSize;
int replication;
final long blockSize;
final int replication;
final byte storagePolicyID;
DatanodeDescriptor clientNode = null;
if(NameNode.stateChangeLog.isDebugEnabled()) {
@ -3056,13 +3078,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
clientNode = blockManager.getDatanodeManager().getDatanodeByHost(
pendingFile.getFileUnderConstructionFeature().getClientMachine());
replication = pendingFile.getFileReplication();
storagePolicyID = pendingFile.getStoragePolicyID();
} finally {
readUnlock();
}
// choose targets for the new block to be allocated.
final DatanodeStorageInfo targets[] = getBlockManager().chooseTarget(
src, replication, clientNode, excludedNodes, blockSize, favoredNodes);
final DatanodeStorageInfo targets[] = getBlockManager().chooseTarget4NewBlock(
src, replication, clientNode, excludedNodes, blockSize, favoredNodes,
storagePolicyID);
// Part II.
// Allocate a new block, add it to the INode and the BlocksMap.
@ -3250,6 +3274,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
final DatanodeDescriptor clientnode;
final long preferredblocksize;
final byte storagePolicyID;
final List<DatanodeStorageInfo> chosen;
checkOperation(OperationCategory.READ);
byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
@ -3276,6 +3301,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
.getClientMachine();
clientnode = blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
preferredblocksize = file.getPreferredBlockSize();
storagePolicyID = file.getStoragePolicyID();
//find datanode storages
final DatanodeManager dm = blockManager.getDatanodeManager();
@ -3285,10 +3311,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
}
// choose new datanodes.
final DatanodeStorageInfo[] targets = blockManager.getBlockPlacementPolicy(
).chooseTarget(src, numAdditionalNodes, clientnode, chosen, true,
// TODO: get storage type from the file
excludes, preferredblocksize, StorageType.DEFAULT);
final DatanodeStorageInfo[] targets = blockManager.chooseTarget4AdditionalDatanode(
src, numAdditionalNodes, clientnode, chosen,
excludes, preferredblocksize, storagePolicyID);
final LocatedBlock lb = new LocatedBlock(blk, targets);
blockManager.setBlockToken(lb, AccessMode.COPY);
return lb;
@ -3975,12 +4000,14 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
try {
checkOperation(OperationCategory.READ);
src = resolvePath(src, pathComponents);
boolean isSuperUser = true;
if (isPermissionEnabled) {
checkPermission(pc, src, false, null, null, null, null, false,
resolveLink);
isSuperUser = pc.isSuperUser();
}
stat = dir.getFileInfo(src, resolveLink,
FSDirectory.isReservedRawName(srcArg));
FSDirectory.isReservedRawName(srcArg), isSuperUser);
} catch (AccessControlException e) {
logAuditEvent(false, "getfileinfo", srcArg);
throw e;
@ -4209,7 +4236,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
/**
* Get the content summary for a specific file/dir.
*
* @param src The string representation of the path to the file
* @param srcArg The string representation of the path to the file
*
* @throws AccessControlException if access is denied
* @throws UnresolvedLinkException if a symlink is encountered.
@ -4785,16 +4812,18 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
"Can't find startAfter " + startAfterString);
}
}
boolean isSuperUser = true;
if (isPermissionEnabled) {
if (dir.isDir(src)) {
checkPathAccess(pc, src, FsAction.READ_EXECUTE);
} else {
checkTraverse(pc, src);
}
isSuperUser = pc.isSuperUser();
}
logAuditEvent(true, "listStatus", srcArg);
dl = dir.getListing(src, startAfter, needLocation);
dl = dir.getListing(src, startAfter, needLocation, isSuperUser);
} finally {
readUnlock();
}
@ -4944,12 +4973,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
/**
* Add the given symbolic link to the fs. Record it in the edits log.
* @param path
* @param target
* @param dirPerms
* @param createParent
* @param logRetryCache
* @param dir
*/
private INodeSymlink addSymlink(String path, String target,
PermissionStatus dirPerms,

View File

@ -29,6 +29,7 @@ import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
@ -684,6 +685,20 @@ public abstract class INode implements INodeAttributes, Diff.Element<byte[]> {
return this;
}
/**
* @return the latest block storage policy id of the INode. Specifically,
* if a storage policy is directly specified on the INode then return the ID
* of that policy. Otherwise follow the latest parental path and return the
* ID of the first specified storage policy.
*/
public abstract byte getStoragePolicyID();
/**
* @return the storage policy directly specified on the INode. Return
* {@link BlockStoragePolicy#ID_UNSPECIFIED} if no policy has
* been specified.
*/
public abstract byte getLocalStoragePolicyID();
/**
* Breaks {@code path} into components.
@ -711,7 +726,7 @@ public abstract class INode implements INodeAttributes, Diff.Element<byte[]> {
* @throws AssertionError if the given path is invalid.
* @return array of path components.
*/
static String[] getPathNames(String path) {
public static String[] getPathNames(String path) {
if (path == null || !path.startsWith(Path.SEPARATOR)) {
throw new AssertionError("Absolute path required");
}

View File

@ -26,7 +26,9 @@ import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.PathIsNotDirectoryException;
import org.apache.hadoop.fs.XAttr;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
import org.apache.hadoop.hdfs.protocol.SnapshotException;
@ -40,6 +42,7 @@ import org.apache.hadoop.hdfs.util.ReadOnlyList;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
/**
* Directory INode class.
@ -103,6 +106,30 @@ public class INodeDirectory extends INodeWithAdditionalFields
return this;
}
@Override
public byte getLocalStoragePolicyID() {
XAttrFeature f = getXAttrFeature();
ImmutableList<XAttr> xattrs = f == null ? ImmutableList.<XAttr> of() : f
.getXAttrs();
for (XAttr xattr : xattrs) {
if (BlockStoragePolicy.isStoragePolicyXAttr(xattr)) {
return (xattr.getValue())[0];
}
}
return BlockStoragePolicy.ID_UNSPECIFIED;
}
@Override
public byte getStoragePolicyID() {
byte id = getLocalStoragePolicyID();
if (id != BlockStoragePolicy.ID_UNSPECIFIED) {
return id;
}
// if it is unspecified, check its parent
return getParent() != null ? getParent().getStoragePolicyID() :
BlockStoragePolicy.ID_UNSPECIFIED;
}
void setQuota(long nsQuota, long dsQuota) {
DirectoryWithQuotaFeature quota = getDirectoryWithQuotaFeature();
if (quota != null) {

View File

@ -18,10 +18,12 @@
package org.apache.hadoop.hdfs.server.namenode;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.XAttr;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.server.namenode.XAttrFeature;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
/**
* The attributes of an inode.

View File

@ -28,6 +28,7 @@ import java.util.List;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
@ -71,12 +72,15 @@ public class INodeFile extends INodeWithAdditionalFields
return inode.asFile();
}
/** Format: [16 bits for replication][48 bits for PreferredBlockSize] */
/**
* Bit format:
* [1-bit lazyPersist][4-bit storagePolicyID][11-bit replication][48-bit preferredBlockSize]
*/
static enum HeaderFormat {
PREFERRED_BLOCK_SIZE(null, 48, 1),
REPLICATION(PREFERRED_BLOCK_SIZE.BITS, 12, 1),
LAZY_PERSIST(REPLICATION.BITS, 4, 0);
REPLICATION(PREFERRED_BLOCK_SIZE.BITS, 11, 1),
STORAGE_POLICY_ID(REPLICATION.BITS, BlockStoragePolicy.ID_BIT_LENGTH, 0),
LAZY_PERSIST(STORAGE_POLICY_ID.BITS, 1, 0);
private final LongBitFormat BITS;
@ -96,10 +100,16 @@ public class INodeFile extends INodeWithAdditionalFields
return LAZY_PERSIST.BITS.retrieve(header) == 0 ? false : true;
}
static long toLong(long preferredBlockSize, short replication, boolean isLazyPersist) {
static byte getStoragePolicyID(long header) {
return (byte)STORAGE_POLICY_ID.BITS.retrieve(header);
}
static long toLong(long preferredBlockSize, short replication,
boolean isLazyPersist, byte storagePolicyID) {
long h = 0;
h = PREFERRED_BLOCK_SIZE.BITS.combine(preferredBlockSize, h);
h = REPLICATION.BITS.combine(replication, h);
h = STORAGE_POLICY_ID.BITS.combine(storagePolicyID, h);
h = LAZY_PERSIST.BITS.combine(isLazyPersist ? 1 : 0, h);
return h;
}
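To make the packed layout above concrete, here is a standalone sketch (not the LongBitFormat helper itself) that packs and unpacks the same fields by hand, assuming the fields occupy the header in declaration order from the low end: 48 bits of preferred block size, then 11 bits of replication, then 4 bits of storage policy id, then the lazy-persist bit. The class and method names are hypothetical.
public class HeaderLayoutSketch {
  // Pack the fields into one 64-bit header, low bits first:
  // bits 0-47 preferredBlockSize, 48-58 replication, 59-62 storagePolicyId, 63 lazyPersist.
  static long pack(long preferredBlockSize, int replication,
                   int storagePolicyId, boolean lazyPersist) {
    long h = preferredBlockSize & ((1L << 48) - 1);
    h |= ((long) replication & 0x7FFL) << 48;
    h |= ((long) storagePolicyId & 0xFL) << 59;
    h |= (lazyPersist ? 1L : 0L) << 63;
    return h;
  }
  static byte storagePolicyId(long header) {
    return (byte) ((header >>> 59) & 0xF);
  }
  static short replication(long header) {
    return (short) ((header >>> 48) & 0x7FF);
  }
  public static void main(String[] args) {
    long h = pack(128L << 20, 3, 12, false); // 128 MB blocks, 3 replicas, policy HOT:12
    System.out.println(replication(h) + " replicas, policy id " + storagePolicyId(h));
  }
}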
@ -114,14 +124,15 @@ public class INodeFile extends INodeWithAdditionalFields
long atime, BlockInfo[] blklist, short replication,
long preferredBlockSize) {
this(id, name, permissions, mtime, atime, blklist, replication,
preferredBlockSize, false);
preferredBlockSize, false, (byte) 0);
}
INodeFile(long id, byte[] name, PermissionStatus permissions, long mtime,
long atime, BlockInfo[] blklist, short replication,
long preferredBlockSize, boolean isLazyPersist) {
long preferredBlockSize, boolean isLazyPersist, byte storagePolicyID) {
super(id, name, permissions, mtime, atime);
header = HeaderFormat.toLong(preferredBlockSize, replication, isLazyPersist);
header = HeaderFormat.toLong(preferredBlockSize, replication,
isLazyPersist, storagePolicyID);
this.blocks = blklist;
}
@ -374,6 +385,32 @@ public class INodeFile extends INodeWithAdditionalFields
return HeaderFormat.getLazyPersistFlag(header);
}
@Override
public byte getLocalStoragePolicyID() {
return HeaderFormat.getStoragePolicyID(header);
}
@Override
public byte getStoragePolicyID() {
byte id = getLocalStoragePolicyID();
if (id == BlockStoragePolicy.ID_UNSPECIFIED) {
return this.getParent() != null ?
this.getParent().getStoragePolicyID() : id;
}
return id;
}
private void setStoragePolicyID(byte storagePolicyId) {
header = HeaderFormat.STORAGE_POLICY_ID.BITS.combine(storagePolicyId,
header);
}
public final void setStoragePolicyID(byte storagePolicyId,
int latestSnapshotId) throws QuotaExceededException {
recordModification(latestSnapshotId);
setStoragePolicyID(storagePolicyId);
}
@Override
public long getHeaderLong() {
return header;

View File

@ -39,6 +39,8 @@ public interface INodeFileAttributes extends INodeAttributes {
public boolean metadataEquals(INodeFileAttributes other);
public byte getLocalStoragePolicyID();
/** A copy of the inode file attributes */
public static class SnapshotCopy extends INodeAttributes.SnapshotCopy
implements INodeFileAttributes {
@ -46,11 +48,12 @@ public interface INodeFileAttributes extends INodeAttributes {
public SnapshotCopy(byte[] name, PermissionStatus permissions,
AclFeature aclFeature, long modificationTime, long accessTime,
short replication, long preferredBlockSize,
boolean isTransient, XAttrFeature xAttrsFeature) {
short replication, long preferredBlockSize, boolean isLazyPersist,
byte storagePolicyID, XAttrFeature xAttrsFeature) {
super(name, permissions, aclFeature, modificationTime, accessTime,
xAttrsFeature);
header = HeaderFormat.toLong(preferredBlockSize, replication, isTransient);
header = HeaderFormat.toLong(preferredBlockSize, replication,
isLazyPersist, storagePolicyID);
}
public SnapshotCopy(INodeFile file) {
@ -71,6 +74,11 @@ public interface INodeFileAttributes extends INodeAttributes {
@Override
public boolean getLazyPersistFlag() { return HeaderFormat.getLazyPersistFlag(header); }
@Override
public byte getLocalStoragePolicyID() {
return HeaderFormat.getStoragePolicyID(header);
}
@Override
public long getHeaderLong() {
return header;

View File

@ -22,6 +22,7 @@ import java.util.List;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
import org.apache.hadoop.hdfs.server.namenode.Quota.Counts;
import org.apache.hadoop.util.GSet;
@ -121,6 +122,16 @@ public class INodeMap {
boolean countDiffChange) throws QuotaExceededException {
return null;
}
@Override
public byte getStoragePolicyID(){
return BlockStoragePolicy.ID_UNSPECIFIED;
}
@Override
public byte getLocalStoragePolicyID() {
return BlockStoragePolicy.ID_UNSPECIFIED;
}
};
return map.get(inode);

View File

@ -285,6 +285,16 @@ public abstract class INodeReference extends INode {
referred.setAccessTime(accessTime);
}
@Override
public final byte getStoragePolicyID() {
return referred.getStoragePolicyID();
}
@Override
public final byte getLocalStoragePolicyID() {
return referred.getLocalStoragePolicyID();
}
@Override
final void recordModification(int latestSnapshotId)
throws QuotaExceededException {

View File

@ -145,4 +145,16 @@ public class INodeSymlink extends INodeWithAdditionalFields {
public void addXAttrFeature(XAttrFeature f) {
throw new UnsupportedOperationException("XAttrs are not supported on symlinks");
}
@Override
public byte getStoragePolicyID() {
throw new UnsupportedOperationException(
"Storage policy are not supported on symlinks");
}
@Override
public byte getLocalStoragePolicyID() {
throw new UnsupportedOperationException(
"Storage policy are not supported on symlinks");
}
}

View File

@ -67,11 +67,12 @@ public class NameNodeLayoutVersion {
EDITLOG_LENGTH(-56, "Add length field to every edit log op"),
XATTRS(-57, "Extended attributes"),
CREATE_OVERWRITE(-58, "Use single editlog record for " +
"creating file with overwrite"),
"creating file with overwrite"),
XATTRS_NAMESPACE_EXT(-59, "Increase number of xattr namespaces"),
BLOCK_STORAGE_POLICY(-60, "Block Storage policy"),
LAZY_PERSIST_FILES(-60, "Support for optional lazy persistence of " +
" files with reduced durability guarantees");
" files with reduced durability guarantees");
private final FeatureInfo info;
/**

View File

@ -585,7 +585,13 @@ class NameNodeRpcServer implements NamenodeProtocols {
throws IOException {
return namesystem.setReplication(src, replication);
}
@Override
public void setStoragePolicy(String src, String policyName)
throws IOException {
namesystem.setStoragePolicy(src, policyName);
}
@Override // ClientProtocol
public void setPermission(String src, FsPermission permissions)
throws IOException {

View File

@ -17,5 +17,20 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
public class RetryStartFileException extends Exception {
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
@InterfaceAudience.Private
public class RetryStartFileException extends IOException {
private static final long serialVersionUID = 1L;
public RetryStartFileException() {
super("Preconditions for creating a file failed because of a " +
"transient error, retry create later.");
}
public RetryStartFileException(String s) {
super(s);
}
}

View File

@ -222,7 +222,7 @@ public class FSImageFormatPBSnapshot {
fileInPb.getAccessTime(), (short) fileInPb.getReplication(),
fileInPb.getPreferredBlockSize(),
fileInPb.hasIsLazyPersist() ? fileInPb.getIsLazyPersist() : false,
xAttrs);
(byte)fileInPb.getStoragePolicyID(), xAttrs);
}
FileDiff diff = new FileDiff(pbf.getSnapshotId(), copy, null,

View File

@ -26,7 +26,6 @@ import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
@ -223,11 +222,8 @@ public class NamenodeWebHdfsMethods {
final DatanodeDescriptor clientNode = bm.getDatanodeManager(
).getDatanodeByHost(getRemoteAddress());
if (clientNode != null) {
final DatanodeStorageInfo[] storages = bm.getBlockPlacementPolicy()
.chooseTarget(path, 1, clientNode,
new ArrayList<DatanodeStorageInfo>(), false, excludes, blocksize,
// TODO: get storage type from the file
StorageType.DEFAULT);
final DatanodeStorageInfo[] storages = bm.chooseTarget4WebHDFS(
path, clientNode, excludes, blocksize);
if (storages.length > 0) {
return storages[0].getDatanodeDescriptor();
}

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs.tools;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.net.InetSocketAddress;
@ -43,6 +44,7 @@ import org.apache.hadoop.fs.FsStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.shell.Command;
import org.apache.hadoop.fs.shell.CommandFormat;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
@ -58,23 +60,24 @@ import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.RollingUpgradeAction;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.RollingUpgradeInfo;
import org.apache.hadoop.hdfs.protocol.SnapshotException;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.RefreshCallQueueProtocol;
import org.apache.hadoop.ipc.GenericRefreshProtocol;
import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RefreshCallQueueProtocol;
import org.apache.hadoop.ipc.RefreshResponse;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.protocolPB.GenericRefreshProtocolClientSideTranslatorPB;
import org.apache.hadoop.ipc.protocolPB.GenericRefreshProtocolPB;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.RefreshUserMappingsProtocol;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
import org.apache.hadoop.ipc.protocolPB.GenericRefreshProtocolClientSideTranslatorPB;
import org.apache.hadoop.ipc.protocolPB.GenericRefreshProtocolPB;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
@ -384,6 +387,8 @@ public class DFSAdmin extends FsShell {
"\t[-shutdownDatanode <datanode_host:ipc_port> [upgrade]]\n" +
"\t[-getDatanodeInfo <datanode_host:ipc_port>]\n" +
"\t[-metasave filename]\n" +
"\t[-setStoragePolicy path policyName]\n" +
"\t[-getStoragePolicy path]\n" +
"\t[-help [cmd]]\n";
/**
@ -589,6 +594,32 @@ public class DFSAdmin extends FsShell {
return inSafeMode;
}
public int setStoragePolicy(String[] argv) throws IOException {
DistributedFileSystem dfs = getDFS();
dfs.setStoragePolicy(new Path(argv[1]), argv[2]);
System.out.println("Set storage policy " + argv[2] + " on " + argv[1]);
return 0;
}
public int getStoragePolicy(String[] argv) throws IOException {
DistributedFileSystem dfs = getDFS();
HdfsFileStatus status = dfs.getClient().getFileInfo(argv[1]);
if (status == null) {
throw new FileNotFoundException("File/Directory does not exist: "
+ argv[1]);
}
byte storagePolicyId = status.getStoragePolicy();
BlockStoragePolicy.Suite suite = BlockStoragePolicy
.readBlockStorageSuite(getConf());
BlockStoragePolicy policy = suite.getPolicy(storagePolicyId);
if (policy != null) {
System.out.println("The storage policy of " + argv[1] + ":\n" + policy);
return 0;
} else {
throw new IOException("Cannot identify the storage policy for " + argv[1]);
}
}
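For reference, here is a minimal client-side sketch of the same calls the two dfsadmin handlers above make: setting a policy on a path, then resolving the policy id reported in its HdfsFileStatus against the configured suite. The class name and path are hypothetical, and it assumes fs.defaultFS points at an HDFS cluster with the storage policies configured.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
public class StoragePolicyClientSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(conf);
    Path dir = new Path("/archive/2014");   // hypothetical path
    dfs.setStoragePolicy(dir, "COLD");      // same call used by -setStoragePolicy
    // Read the policy id back from the file status and resolve it.
    HdfsFileStatus status = dfs.getClient().getFileInfo(dir.toUri().getPath());
    BlockStoragePolicy.Suite suite = BlockStoragePolicy.readBlockStorageSuite(conf);
    BlockStoragePolicy policy = suite.getPolicy(status.getStoragePolicy());
    System.out.println("Storage policy of " + dir + ": " + policy);
  }
}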
/**
* Allow snapshot on a directory.
* Usage: java DFSAdmin -allowSnapshot snapshotDir
@ -930,7 +961,13 @@ public class DFSAdmin extends FsShell {
String getDatanodeInfo = "-getDatanodeInfo <datanode_host:ipc_port>\n"
+ "\tGet the information about the given datanode. This command can\n"
+ "\tbe used for checking if a datanode is alive.\n";
String setStoragePolicy = "-setStoragePolicy path policyName\n"
+ "\tSet the storage policy for a file/directory.\n";
String getStoragePolicy = "-getStoragePolicy path\n"
+ "\tGet the storage policy for a file/directory.\n";
String help = "-help [cmd]: \tDisplays help for the given command or all commands if none\n" +
"\t\tis specified.\n";
@ -988,6 +1025,10 @@ public class DFSAdmin extends FsShell {
System.out.println(shutdownDatanode);
} else if ("getDatanodeInfo".equalsIgnoreCase(cmd)) {
System.out.println(getDatanodeInfo);
} else if ("setStoragePolicy".equalsIgnoreCase(cmd)) {
System.out.println(setStoragePolicy);
} else if ("getStoragePolicy".equalsIgnoreCase(cmd)) {
System.out.println(getStoragePolicy);
} else if ("help".equals(cmd)) {
System.out.println(help);
} else {
@ -1019,6 +1060,8 @@ public class DFSAdmin extends FsShell {
System.out.println(disallowSnapshot);
System.out.println(shutdownDatanode);
System.out.println(getDatanodeInfo);
System.out.println(setStoragePolicy);
System.out.println(getStoragePolicy);
System.out.println(help);
System.out.println();
ToolRunner.printGenericCommandUsage(System.out);
@ -1378,6 +1421,12 @@ public class DFSAdmin extends FsShell {
} else if ("-safemode".equals(cmd)) {
System.err.println("Usage: hdfs dfsadmin"
+ " [-safemode enter | leave | get | wait]");
} else if ("-setStoragePolicy".equals(cmd)) {
System.err.println("Usage: java DFSAdmin"
+ " [-setStoragePolicy path policyName]");
} else if ("-getStoragePolicy".equals(cmd)) {
System.err.println("Usage: java DFSAdmin"
+ " [-getStoragePolicy path]");
} else if ("-allowSnapshot".equalsIgnoreCase(cmd)) {
System.err.println("Usage: hdfs dfsadmin"
+ " [-allowSnapshot <snapshotDir>]");
@ -1586,6 +1635,16 @@ public class DFSAdmin extends FsShell {
printUsage(cmd);
return exitCode;
}
} else if ("-setStoragePolicy".equals(cmd)) {
if (argv.length != 3) {
printUsage(cmd);
return exitCode;
}
} else if ("-getStoragePolicy".equals(cmd)) {
if (argv.length != 2) {
printUsage(cmd);
return exitCode;
}
}
// initialize DFSAdmin
@ -1657,6 +1716,10 @@ public class DFSAdmin extends FsShell {
exitCode = shutdownDatanode(argv, i);
} else if ("-getDatanodeInfo".equals(cmd)) {
exitCode = getDatanodeInfo(argv, i);
} else if ("-setStoragePolicy".equals(cmd)) {
exitCode = setStoragePolicy(argv);
} else if ("-getStoragePolicy".equals(cmd)) {
exitCode = getStoragePolicy(argv);
} else if ("-help".equals(cmd)) {
if (i < argv.length) {
printHelp(argv[i]);

View File

@ -105,6 +105,15 @@ public class EnumCounters<E extends Enum<E>> {
this.counters[i] -= that.counters[i];
}
}
/** @return the sum of all counters. */
public final long sum() {
long sum = 0;
for(int i = 0; i < counters.length; i++) {
sum += counters[i];
}
return sum;
}
@Override
public boolean equals(Object obj) {

View File

@ -21,6 +21,7 @@ import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.XAttrHelper;
import org.apache.hadoop.hdfs.protocol.*;
@ -230,6 +231,7 @@ public class JsonUtil {
m.put("replication", status.getReplication());
m.put("fileId", status.getFileId());
m.put("childrenNum", status.getChildrenNum());
m.put("storagePolicy", status.getStoragePolicy());
return includeType ? toJsonString(FileStatus.class, m): JSON.toString(m);
}
@ -262,9 +264,13 @@ public class JsonUtil {
Long childrenNumLong = (Long) m.get("childrenNum");
final int childrenNum = (childrenNumLong == null) ? -1
: childrenNumLong.intValue();
final byte storagePolicy = m.containsKey("storagePolicy") ?
(byte) (long) (Long) m.get("storagePolicy") :
BlockStoragePolicy.ID_UNSPECIFIED;
return new HdfsFileStatus(len, type == PathType.DIRECTORY, replication,
blockSize, isLazyPersist, mTime, aTime, permission, owner, group,
symlink, DFSUtil.string2Bytes(localName), fileId, childrenNum, null);
symlink, DFSUtil.string2Bytes(localName), fileId, childrenNum, null,
storagePolicy);
}
/** Convert an ExtendedBlock to a Json map. */

View File

@ -101,6 +101,14 @@ message SetReplicationResponseProto {
required bool result = 1;
}
message SetStoragePolicyRequestProto {
required string src = 1;
required string policyName = 2;
}
message SetStoragePolicyResponseProto { // void response
}
message SetPermissionRequestProto {
required string src = 1;
required FsPermissionProto permission = 2;
@ -690,6 +698,8 @@ service ClientNamenodeProtocol {
rpc append(AppendRequestProto) returns(AppendResponseProto);
rpc setReplication(SetReplicationRequestProto)
returns(SetReplicationResponseProto);
rpc setStoragePolicy(SetStoragePolicyRequestProto)
returns(SetStoragePolicyResponseProto);
rpc setPermission(SetPermissionRequestProto)
returns(SetPermissionResponseProto);
rpc setOwner(SetOwnerRequestProto) returns(SetOwnerResponseProto);

View File

@ -138,7 +138,8 @@ message INodeSection {
optional FileUnderConstructionFeature fileUC = 7;
optional AclFeatureProto acl = 8;
optional XAttrFeatureProto xAttrs = 9;
optional bool isLazyPersist = 10 [default = false];
optional uint32 storagePolicyID = 10;
optional bool isLazyPersist = 11 [default = false];
}
message INodeDirectory {

View File

@ -158,7 +158,8 @@ message FsPermissionProto {
enum StorageTypeProto {
DISK = 1;
SSD = 2;
RAM_DISK = 3;
ARCHIVE = 3;
RAM_DISK = 4;
}
/**
@ -263,7 +264,9 @@ message HdfsFileStatusProto {
optional int32 childrenNum = 14 [default = -1];
// Optional field for file encryption
optional FileEncryptionInfoProto fileEncryptionInfo = 15;
optional bool isLazyPersist = 16 [default = false];
optional uint32 storagePolicy = 16 [default = 0]; // block storage policy id
optional bool isLazyPersist = 17 [default = false];
}
/**

View File

@ -0,0 +1,118 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Do not modify this file directly. Instead, copy entries that you wish -->
<!-- to modify from this file into blockStoragePolicy-site.xml and change -->
<!-- there. If blockStoragePolicy-site.xml does not exist, create it. -->
<configuration>
<property>
<name>dfs.block.storage.policies</name>
<value>HOT:12, WARM:8, COLD:4</value>
<description>
A list of block storage policy names and IDs. The syntax is
NAME_1:ID_1, NAME_2:ID_2, ..., NAME_n:ID_n
where ID is an integer in the range [1,15] and NAME is case insensitive.
The first element is the default policy. Empty list is not allowed.
</description>
</property>
<!-- Block Storage Policy HOT:12 -->
<property>
<name>dfs.block.storage.policy.12</name>
<value>DISK</value>
<description>
A list of storage types for storing the block replicas such as
STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_n
When creating a block, the i-th replica is stored using i-th storage type
for i less than or equal to n, and
the j-th replica is stored using n-th storage type for j greater than n.
Empty list is not allowed.
Examples:
DISK : all replicas stored using DISK.
DISK, ARCHIVE : the first replica is stored using DISK and all the
remaining replicas are stored using ARCHIVE.
</description>
</property>
<property>
<name>dfs.block.storage.policy.creation-fallback.12</name>
<value></value>
<description>
A list of storage types for creation fallback storage.
STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_n
When creating a block, if a particular storage type specified in the policy
is unavailable, the fallback STORAGE_TYPE_1 is used. Further, if
STORAGE_TYPE_i is also unavailable, the fallback STORAGE_TYPE_(i+1) is used.
If all fallback storages are unavailable, the block will be created
with fewer replicas than the specified replication factor.
An empty list indicates that there is no fallback storage.
</description>
</property>
<property>
<name>dfs.block.storage.policy.replication-fallback.12</name>
<value>ARCHIVE</value>
<description>
Similar to dfs.block.storage.policy.creation-fallback.x but for replication.
</description>
</property>
<!-- Block Storage Policy WARM:8 -->
<property>
<name>dfs.block.storage.policy.8</name>
<value>DISK, ARCHIVE</value>
</property>
<property>
<name>dfs.block.storage.policy.creation-fallback.8</name>
<value>DISK, ARCHIVE</value>
</property>
<property>
<name>dfs.block.storage.policy.replication-fallback.8</name>
<value>DISK, ARCHIVE</value>
</property>
<!-- Block Storage Policy COLD:4 -->
<property>
<name>dfs.block.storage.policy.4</name>
<value>ARCHIVE</value>
</property>
<property>
<name>dfs.block.storage.policy.creation-fallback.4</name>
<value></value>
</property>
<property>
<name>dfs.block.storage.policy.replication-fallback.4</name>
<value></value>
</property>
</configuration>

View File

@ -22,7 +22,8 @@
<!-- wish to modify from this file into hdfs-site.xml and change them -->
<!-- there. If hdfs-site.xml does not already exist, create it. -->
<configuration>
<configuration xmlns:xi="http://www.w3.org/2001/XInclude">
<xi:include href="blockStoragePolicy-default.xml" />
<property>
<name>hadoop.hdfs.configuration.version</name>
@ -2078,19 +2079,6 @@
</description>
</property>
<property>
<name>dfs.namenode.randomize-block-locations-per-block</name>
<value>false</value>
<description>When fetching replica locations of a block, the replicas
are sorted based on network distance. This configuration parameter
determines whether the replicas at the same network distance are randomly
shuffled. By default, this is false, such that repeated requests for a block's
replicas always result in the same order. This potentially improves page cache
behavior. However, for some network topologies, it is desirable to shuffle this
order for better load balancing.
</description>
</property>
<property>
<name>dfs.datanode.block.id.layout.upgrade.threads</name>
<value>12</value>

View File

@ -0,0 +1,302 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
HDFS Archival Storage
---
---
${maven.build.timestamp}
HDFS Archival Storage
%{toc|section=1|fromDepth=0}
* {Introduction}
<Archival Storage> is a solution to decouple growing storage capacity from compute capacity.
Nodes with higher density and less expensive storage with low compute power are becoming available
and can be used as cold storage in the clusters.
Based on policy, data can be moved from the hot storage to the cold storage.
Adding more nodes to the cold storage grows the storage capacity independently of the compute capacity
in the cluster.
* {Storage Types and Storage Policies}
** {Storage Types: DISK, SSD and ARCHIVE}
The first phase of
{{{https://issues.apache.org/jira/browse/HDFS-2832}Heterogeneous Storage (HDFS-2832)}}
changed the datanode storage model from a single storage,
which may correspond to multiple physical storage media,
to a collection of storages, with each storage corresponding to a physical storage medium.
It also added the notion of storage types, DISK and SSD,
where DISK is the default storage type.
A new storage type <ARCHIVE>,
which has high storage density (petabyte of storage) but little compute power,
is added for supporting archival storage.
** {Storage Policies: Hot, Warm and Cold}
A new concept of storage policies is introduced in order to allow files to be stored
in different storage types according to the storage policy.
We have the following storage policies:
* <<Hot>> - for both storage and compute.
The data that is popular and still being used for processing will stay in this policy.
When a block is hot, all replicas are stored in DISK.
* <<Cold>> - only for storage with limited compute.
The data that is no longer being used, or data that needs to be archived is moved
from hot storage to cold storage.
When a block is cold, all replicas are stored in ARCHIVE.
* <<Warm>> - partially hot and partially cold.
When a block is warm, some of its replicas are stored in DISK
and the remaining replicas are stored in ARCHIVE.
[]
More formally, a storage policy consists of the following fields:
[[1]] Policy ID
[[2]] Policy name
[[3]] A list of storage types for block placement
[[4]] A list of fallback storage types for file creation
[[5]] A list of fallback storage types for replication
[]
When there is enough space,
block replicas are stored according to the storage type list specified in #3.
When some of the storage types in list #3 are running out of space,
the fallback storage type lists specified in #4 and #5 are used
to replace the out-of-space storage types for file creation and replication, respectively.
The following is a typical storage policy table.
*--------+---------------+-------------------------+-----------------------+-----------------------+
| <<Policy>> | <<Policy>>| <<Block Placement>> | <<Fallback storages>> | <<Fallback storages>> |
| <<ID>> | <<Name>> | <<(n\ replicas)>> | <<for creation>> | <<for replication>> |
*--------+---------------+-------------------------+-----------------------+-----------------------+
| 12 | Hot (default) | DISK: <n> | \<none\> | ARCHIVE |
*--------+---------------+-------------------------+-----------------------+-----------------------+
| 8 | Warm | DISK: 1, ARCHIVE: <n>-1 | ARCHIVE, DISK | ARCHIVE, DISK |
*--------+---------------+-------------------------+-----------------------+-----------------------+
| 4 | Cold | ARCHIVE: <n> | \<none\> | \<none\> |
*--------+---------------+-------------------------+-----------------------+-----------------------+
Note that cluster administrators may change the storage policy table
according to the characteristics of the cluster.
For example, in order to prevent losing archival data,
administrators may want to use DISK as fallback storage for replication in the Cold policy.
A drawback of such a setting is that the DISK storages could be filled up with archival data.
As a result, the entire cluster may become full and can no longer serve hot data.
** {Configurations}
*** {Setting The List of All Storage Policies}
* <<dfs.block.storage.policies>>
- a list of block storage policy names and IDs.
The syntax is
NAME_1:ID_1, NAME_2:ID_2, ..., NAME_<n>:ID_<n>
where ID is an integer in the closed range [1,15] and NAME is case insensitive.
The first element is the <default policy>. Empty list is not allowed.
The default value is shown below.
+------------------------------------------+
<property>
<name>dfs.block.storage.policies</name>
<value>HOT:12, WARM:8, COLD:4</value>
</property>
+------------------------------------------+
[]
*** {Setting Storage Policy Details}
The following configuration properties are for setting the details of each storage policy,
where <<<\<ID\>>>> is the actual policy ID.
* <<dfs.block.storage.policy.\<ID\>>>
- a list of storage types for storing the block replicas.
The syntax is
STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_<n>
When creating a block, the <i>-th replica is stored using the <i>-th storage type
for <i> less than or equal to <n>, and
the <j>-th replica is stored using the <n>-th storage type for <j> greater than <n>
(see the sketch following this list).
An empty list is not allowed.
Examples:
+------------------------------------------+
DISK : all replicas stored using DISK.
DISK, ARCHIVE : the first replica is stored using DISK and all the
remaining replicas are stored using ARCHIVE.
+------------------------------------------+
* <<dfs.block.storage.policy.creation-fallback.\<ID\>>>
- a list of storage types for creation fallback storage.
The syntax is
STORAGE_TYPE_1, STORAGE_TYPE_2, ..., STORAGE_TYPE_n
When creating a block, if a particular storage type specified in the policy
is unavailable, the fallback STORAGE_TYPE_1 is used. Further, if
STORAGE_TYPE_<i> is also unavailable, the fallback STORAGE_TYPE_<(i+1)> is used.
If all fallback storages are unavailable, the block will be created
with fewer replicas than the specified replication factor.
An empty list indicates that there is no fallback storage.
* <<dfs.block.storage.policy.replication-fallback.\<ID\>>>
- a list of storage types for replication fallback storage.
The usage of this configuration property is similar to
<<<dfs.block.storage.policy.creation-fallback.\<ID\>>>>
except that it applies to replication rather than block creation.
[]
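For illustration only, the replica-to-storage-type mapping described above can be sketched as follows
(a hypothetical helper named <<<ReplicaTypeExample>>>, not the code used by the NameNode):
+------------------------------------------+
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hdfs.StorageType;

// Illustrative sketch only: the k-th replica (1-based) uses the k-th configured
// storage type; replicas beyond the configured list reuse the last type.
class ReplicaTypeExample {
  static List<StorageType> chooseTypes(short replication,
      StorageType[] configured) {
    List<StorageType> result = new ArrayList<StorageType>();
    for (int k = 1; k <= replication; k++) {
      int index = Math.min(k, configured.length) - 1;
      result.add(configured[index]);
    }
    return result;
  }
}
+------------------------------------------+
For the WARM policy with <<<DISK, ARCHIVE>>> and replication 3, this yields DISK for the first replica
and ARCHIVE for the remaining two, matching the table above.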
The following are the default configuration values for Hot, Warm and Cold storage policies.
* Block Storage Policy <<HOT:12>>
+------------------------------------------+
<property>
<name>dfs.block.storage.policy.12</name>
<value>DISK</value>
</property>
<property>
<name>dfs.block.storage.policy.creation-fallback.12</name>
<value></value>
</property>
<property>
<name>dfs.block.storage.policy.replication-fallback.12</name>
<value>ARCHIVE</value>
</property>
+------------------------------------------+
* Block Storage Policy <<WARM:8>>
+------------------------------------------+
<property>
<name>dfs.block.storage.policy.8</name>
<value>DISK, ARCHIVE</value>
</property>
<property>
<name>dfs.block.storage.policy.creation-fallback.8</name>
<value>DISK, ARCHIVE</value>
</property>
<property>
<name>dfs.block.storage.policy.replication-fallback.8</name>
<value>DISK, ARCHIVE</value>
</property>
+------------------------------------------+
* Block Storage Policy <<COLD:4>>
+------------------------------------------+
<property>
<name>dfs.block.storage.policy.4</name>
<value>ARCHIVE</value>
</property>
<property>
<name>dfs.block.storage.policy.creation-fallback.4</name>
<value></value>
</property>
<property>
<name>dfs.block.storage.policy.replication-fallback.4</name>
<value></value>
</property>
+------------------------------------------+
[]
* {Mover - A New Data Migration Tool}
A new data migration tool is added for archiving data.
The tool works similarly to the Balancer.
It periodically scans the files in HDFS to check whether the block placement satisfies the storage policy.
For blocks that violate the storage policy,
it moves the replicas to a different storage type
in order to fulfill the storage policy requirement.
* Command:
+------------------------------------------+
hdfs mover [-p <files/dirs> | -f <local file name>]
+------------------------------------------+
* Arguments:
*-------------------------+--------------------------------------------------------+
| <<<-p \<files/dirs\>>>> | Specify a space-separated list of HDFS files/dirs to migrate.
*-------------------------+--------------------------------------------------------+
| <<<-f \<local file\>>>> | Specify a local file containing a list of HDFS files/dirs to migrate.
*-------------------------+--------------------------------------------------------+
Note that when both the -p and -f options are omitted, the default path is the root directory.
[]
* {<<<DFSAdmin>>> Commands}
** {Set Storage Policy}
Set a storage policy on a file or a directory.
* Command:
+------------------------------------------+
hdfs dfsadmin -setStoragePolicy <path> <policyName>
+------------------------------------------+
* Arguments:
*----------------------+-----------------------------------------------------+
| <<<\<path\>>>> | The path referring to either a directory or a file. |
*----------------------+-----------------------------------------------------+
| <<<\<policyName\>>>> | The name of the storage policy. |
*----------------------+-----------------------------------------------------+
[]
** {Get Storage Policy}
Get the storage policy of a file or a directory.
* Command:
+------------------------------------------+
hdfs dfsadmin -getStoragePolicy <path>
+------------------------------------------+
* Arguments:
*----------------------+-----------------------------------------------------+
| <<<\<path\>>>> | The path referring to either a directory or a file. |
*----------------------+-----------------------------------------------------+
[]
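The storage policy can also be set programmatically through the <<<DistributedFileSystem>>> API.
The following is a minimal sketch (the class name <<<SetPolicyExample>>> and the path are placeholders;
it assumes <<<fs.defaultFS>>> in the loaded configuration points at an HDFS cluster):
+------------------------------------------+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;

// Illustrative sketch: set the WARM policy on a directory via the client API.
public class SetPolicyExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Assumes fs.defaultFS resolves to the target HDFS cluster.
    DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(conf);
    dfs.setStoragePolicy(new Path("/data/archive"), "WARM");
    dfs.close();
  }
}
+------------------------------------------+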

View File

@ -147,18 +147,19 @@ HDFS Commands Guide
*-----------------+-----------------------------------------------------------+
| -regular | Normal datanode startup (default).
*-----------------+-----------------------------------------------------------+
| -rollback | Rollsback the datanode to the previous version. This should
| -rollback | Rollback the datanode to the previous version. This should
| | be used after stopping the datanode and distributing the
| | old hadoop version.
*-----------------+-----------------------------------------------------------+
| -rollingupgrade rollback | Rollsback a rolling upgrade operation.
| -rollingupgrade rollback | Rollback a rolling upgrade operation.
*-----------------+-----------------------------------------------------------+
** <<<dfsadmin>>>
Runs a HDFS dfsadmin client.
Usage: <<<hdfs dfsadmin [GENERIC_OPTIONS]
+------------------------------------------+
Usage: hdfs dfsadmin [GENERIC_OPTIONS]
[-report [-live] [-dead] [-decommissioning]]
[-safemode enter | leave | get | wait]
[-saveNamespace]
@ -169,6 +170,8 @@ HDFS Commands Guide
[-clrQuota <dirname>...<dirname>]
[-setSpaceQuota <quota> <dirname>...<dirname>]
[-clrSpaceQuota <dirname>...<dirname>]
[-setStoragePolicy <path> <policyName>]
[-getStoragePolicy <path>]
[-finalizeUpgrade]
[-rollingUpgrade [<query>|<prepare>|<finalize>]]
[-metasave filename]
@ -186,7 +189,8 @@ HDFS Commands Guide
[-fetchImage <local directory>]
[-shutdownDatanode <datanode_host:ipc_port> [upgrade]]
[-getDatanodeInfo <datanode_host:ipc_port>]
[-help [cmd]]>>>
[-help [cmd]]
+------------------------------------------+
*-----------------+-----------------------------------------------------------+
|| COMMAND_OPTION || Description
@ -236,6 +240,10 @@ HDFS Commands Guide
| {{{../hadoop-hdfs/HdfsQuotaAdminGuide.html#Administrative_Commands}HDFS Quotas Guide}}
| for the detail.
*-----------------+-----------------------------------------------------------+
| -setStoragePolicy \<path\> \<policyName\> | Set a storage policy on a file or a directory.
*-----------------+-----------------------------------------------------------+
| -getStoragePolicy \<path\> | Get the storage policy of a file or a directory.
*-----------------+-----------------------------------------------------------+
| -finalizeUpgrade| Finalize upgrade of HDFS. Datanodes delete their previous
| version working directories, followed by Namenode doing the
| same. This completes the upgrade process.
@ -250,7 +258,7 @@ HDFS Commands Guide
| <filename> will contain one line for each of the following\
| 1. Datanodes heart beating with Namenode\
| 2. Blocks waiting to be replicated\
| 3. Blocks currrently being replicated\
| 3. Blocks currently being replicated\
| 4. Blocks waiting to be deleted
*-----------------+-----------------------------------------------------------+
| -refreshServiceAcl | Reload the service-level authorization policy file.
@ -312,12 +320,30 @@ HDFS Commands Guide
| is specified.
*-----------------+-----------------------------------------------------------+
** <<<mover>>>
Runs the data migration utility.
See {{{./ArchivalStorage.html#Mover_-_A_New_Data_Migration_Tool}Mover}} for more details.
Usage: <<<hdfs mover [-p <files/dirs> | -f <local file name>]>>>
*--------------------+--------------------------------------------------------+
|| COMMAND_OPTION || Description
*--------------------+--------------------------------------------------------+
| -p \<files/dirs\> | Specify a space-separated list of HDFS files/dirs to migrate.
*--------------------+--------------------------------------------------------+
| -f \<local file\> | Specify a local file containing a list of HDFS files/dirs to migrate.
*--------------------+--------------------------------------------------------+
Note that when both the -p and -f options are omitted, the default path is the root directory.
** <<<namenode>>>
Runs the namenode. More info about the upgrade, rollback and finalize is at
{{{./HdfsUserGuide.html#Upgrade_and_Rollback}Upgrade Rollback}}.
Usage: <<<hdfs namenode [-backup] |
+------------------------------------------+
Usage: hdfs namenode [-backup] |
[-checkpoint] |
[-format [-clusterid cid ] [-force] [-nonInteractive] ] |
[-upgrade [-clusterid cid] [-renameReserved<k-v pairs>] ] |
@ -329,7 +355,8 @@ HDFS Commands Guide
[-initializeSharedEdits] |
[-bootstrapStandby] |
[-recover [-force] ] |
[-metadataVersion ]>>>
[-metadataVersion ]
+------------------------------------------+
*--------------------+--------------------------------------------------------+
|| COMMAND_OPTION || Description
@ -351,7 +378,7 @@ HDFS Commands Guide
| -upgradeOnly [-clusterid cid] [-renameReserved\<k-v pairs\>] | Upgrade the
| specified NameNode and then shutdown it.
*--------------------+--------------------------------------------------------+
| -rollback | Rollsback the NameNode to the previous version. This
| -rollback | Rollback the NameNode to the previous version. This
| should be used after stopping the cluster and
| distributing the old Hadoop version.
*--------------------+--------------------------------------------------------+

View File

@ -24,6 +24,8 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -66,6 +68,7 @@ import org.apache.hadoop.hdfs.server.namenode.ha
.ConfiguredFailoverProxyProvider;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.tools.DFSAdmin;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.net.NetUtils;
@ -75,6 +78,8 @@ import org.apache.hadoop.security.ShellBasedUnixGroupsMapping;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.VersionInfo;
import org.junit.Assume;
@ -88,11 +93,8 @@ import java.security.PrivilegedExceptionAction;
import java.util.*;
import java.util.concurrent.TimeoutException;
import static org.apache.hadoop.fs.CreateFlag.CREATE;
import static org.apache.hadoop.fs.CreateFlag.LAZY_PERSIST;
import static org.apache.hadoop.fs.CreateFlag.OVERWRITE;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY;
import static org.apache.hadoop.fs.CreateFlag.*;
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@ -891,6 +893,37 @@ public class DFSTestUtil {
conf.set(DFSConfigKeys.DFS_NAMESERVICES, Joiner.on(",")
.join(nameservices));
}
public static void setFederatedHAConfiguration(MiniDFSCluster cluster,
Configuration conf) {
Map<String, List<String>> nameservices = Maps.newHashMap();
for (NameNodeInfo info : cluster.getNameNodeInfos()) {
Preconditions.checkState(info.nameserviceId != null);
List<String> nns = nameservices.get(info.nameserviceId);
if (nns == null) {
nns = Lists.newArrayList();
nameservices.put(info.nameserviceId, nns);
}
nns.add(info.nnId);
conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY,
info.nameserviceId, info.nnId),
DFSUtil.createUri(HdfsConstants.HDFS_URI_SCHEME,
info.nameNode.getNameNodeAddress()).toString());
conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY,
info.nameserviceId, info.nnId),
DFSUtil.createUri(HdfsConstants.HDFS_URI_SCHEME,
info.nameNode.getNameNodeAddress()).toString());
}
for (Map.Entry<String, List<String>> entry : nameservices.entrySet()) {
conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX,
entry.getKey()), Joiner.on(",").join(entry.getValue()));
conf.set(DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "." + entry
.getKey(), ConfiguredFailoverProxyProvider.class.getName());
}
conf.set(DFSConfigKeys.DFS_NAMESERVICES, Joiner.on(",")
.join(nameservices.keySet()));
}
private static DatanodeID getDatanodeID(String ipAddr) {
return new DatanodeID(ipAddr, "localhost",
@ -974,9 +1007,14 @@ public class DFSTestUtil {
public static DatanodeStorageInfo[] createDatanodeStorageInfos(int n) {
return createDatanodeStorageInfos(n, null, null);
}
public static DatanodeStorageInfo[] createDatanodeStorageInfos(
int n, String[] racks, String[] hostnames) {
return createDatanodeStorageInfos(n, racks, hostnames, null);
}
public static DatanodeStorageInfo[] createDatanodeStorageInfos(
int n, String[] racks, String[] hostnames, StorageType[] types) {
DatanodeStorageInfo[] storages = new DatanodeStorageInfo[n];
for(int i = storages.length; i > 0; ) {
final String storageID = "s" + i;
@ -984,16 +1022,30 @@ public class DFSTestUtil {
i--;
final String rack = (racks!=null && i < racks.length)? racks[i]: "defaultRack";
final String hostname = (hostnames!=null && i < hostnames.length)? hostnames[i]: "host";
storages[i] = createDatanodeStorageInfo(storageID, ip, rack, hostname);
final StorageType type = (types != null && i < types.length) ? types[i]
: StorageType.DEFAULT;
storages[i] = createDatanodeStorageInfo(storageID, ip, rack, hostname,
type);
}
return storages;
}
public static DatanodeStorageInfo createDatanodeStorageInfo(
String storageID, String ip, String rack, String hostname) {
final DatanodeStorage storage = new DatanodeStorage(storageID);
final DatanodeDescriptor dn = BlockManagerTestUtil.getDatanodeDescriptor(ip, rack, storage, hostname);
return createDatanodeStorageInfo(storageID, ip, rack, hostname,
StorageType.DEFAULT);
}
public static DatanodeStorageInfo createDatanodeStorageInfo(
String storageID, String ip, String rack, String hostname,
StorageType type) {
final DatanodeStorage storage = new DatanodeStorage(storageID,
DatanodeStorage.State.NORMAL, type);
final DatanodeDescriptor dn = BlockManagerTestUtil.getDatanodeDescriptor(
ip, rack, storage, hostname);
return BlockManagerTestUtil.newDatanodeStorageInfo(dn, storage);
}
public static DatanodeDescriptor[] toDatanodeDescriptor(
DatanodeStorageInfo[] storages) {
DatanodeDescriptor[] datanodes = new DatanodeDescriptor[storages.length];
@ -1080,6 +1132,8 @@ public class DFSTestUtil {
FSDataOutputStream s = filesystem.create(pathFileCreate);
// OP_CLOSE 9
s.close();
// OP_SET_STORAGE_POLICY 45
filesystem.setStoragePolicy(pathFileCreate, "HOT");
// OP_RENAME_OLD 1
final Path pathFileMoved = new Path("/file_moved");
filesystem.rename(pathFileCreate, pathFileMoved);
@ -1440,6 +1494,57 @@ public class DFSTestUtil {
return expectedPrimary.getDatanodeDescriptor();
}
public static void toolRun(Tool tool, String cmd, int retcode, String contain)
throws Exception {
String [] cmds = StringUtils.split(cmd, ' ');
System.out.flush();
System.err.flush();
PrintStream origOut = System.out;
PrintStream origErr = System.err;
String output = null;
int ret = 0;
try {
ByteArrayOutputStream bs = new ByteArrayOutputStream(1024);
PrintStream out = new PrintStream(bs);
System.setOut(out);
System.setErr(out);
ret = tool.run(cmds);
System.out.flush();
System.err.flush();
out.close();
output = bs.toString();
} finally {
System.setOut(origOut);
System.setErr(origErr);
}
System.out.println("Output for command: " + cmd + " retcode: " + ret);
if (output != null) {
System.out.println(output);
}
assertEquals(retcode, ret);
if (contain != null) {
assertTrue("The real output is: " + output + ".\n It should contain: "
+ contain, output.contains(contain));
}
}
public static void FsShellRun(String cmd, int retcode, String contain,
Configuration conf) throws Exception {
FsShell shell = new FsShell(new Configuration(conf));
toolRun(shell, cmd, retcode, contain);
}
public static void DFSAdminRun(String cmd, int retcode, String contain,
Configuration conf) throws Exception {
DFSAdmin admin = new DFSAdmin(new Configuration(conf));
toolRun(admin, cmd, retcode, contain);
}
public static void FsShellRun(String cmd, Configuration conf)
throws Exception {
FsShellRun(cmd, 0, null, conf);
}
public static void addDataNodeLayoutVersion(final int lv, final String description)
throws NoSuchFieldException, IllegalAccessException {
Preconditions.checkState(lv < DataNodeLayoutVersion.CURRENT_LAYOUT_VERSION);

View File

@ -1475,19 +1475,21 @@ public class MiniDFSCluster {
secureResources, dn.getIpcPort()));
dns[i - curDatanodesNum] = dn;
}
curDatanodesNum += numDataNodes;
this.numDataNodes += numDataNodes;
waitActive();
if (storageCapacities != null) {
for (int i = curDatanodesNumSaved; i < curDatanodesNumSaved+numDataNodes; ++i) {
List<? extends FsVolumeSpi> volumes = dns[i].getFSDataset().getVolumes();
assert storageCapacities[i].length == storagesPerDatanode;
for (int i = curDatanodesNum; i < curDatanodesNum+numDataNodes; ++i) {
final int index = i - curDatanodesNum;
List<? extends FsVolumeSpi> volumes = dns[index].getFSDataset().getVolumes();
assert storageCapacities[index].length == storagesPerDatanode;
assert volumes.size() == storagesPerDatanode;
for (int j = 0; j < volumes.size(); ++j) {
FsVolumeImpl volume = (FsVolumeImpl) volumes.get(j);
volume.setCapacityForTesting(storageCapacities[i][j]);
LOG.info("setCapacityForTesting " + storageCapacities[index][j]
+ " for [" + volume.getStorageType() + "]" + volume.getStorageID());
volume.setCapacityForTesting(storageCapacities[index][j]);
}
}
}

View File

@ -255,12 +255,12 @@ public class TestDFSClientRetries {
Mockito.doReturn(
new HdfsFileStatus(0, false, 1, 1024, false, 0, 0, new FsPermission(
(short) 777), "owner", "group", new byte[0], new byte[0],
1010, 0, null)).when(mockNN).getFileInfo(anyString());
1010, 0, null, (byte) 0)).when(mockNN).getFileInfo(anyString());
Mockito.doReturn(
new HdfsFileStatus(0, false, 1, 1024, false, 0, 0, new FsPermission(
(short) 777), "owner", "group", new byte[0], new byte[0],
1010, 0, null))
1010, 0, null, (byte) 0))
.when(mockNN)
.create(anyString(), (FsPermission) anyObject(), anyString(),
(EnumSetWritable<CreateFlag>) anyObject(), anyBoolean(),

View File

@ -64,7 +64,7 @@ public class TestDFSInotifyEventInputStream {
*/
@Test
public void testOpcodeCount() {
Assert.assertTrue(FSEditLogOpCodes.values().length == 46);
Assert.assertTrue(FSEditLogOpCodes.values().length == 47);
}

View File

@ -940,7 +940,7 @@ public class TestEncryptionZones {
Future<?> future = executor.submit(new CreateFileTask(fsWrapper, file));
// Flip-flop between two EZs to repeatedly fail
for (int i=0; i<10; i++) {
for (int i=0; i<DFSOutputStream.CREATE_RETRY_COUNT+1; i++) {
injector.ready.await();
fsWrapper.delete(zone1, true);
fsWrapper.mkdir(zone1, FsPermission.getDirDefault(), true);

View File

@ -17,12 +17,14 @@
*/
package org.apache.hadoop.hdfs;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.anyList;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.anyShort;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Matchers.anyShort;
import static org.mockito.Matchers.anyString;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.spy;
@ -38,7 +40,6 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.crypto.CipherSuite;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Options;
import org.apache.hadoop.fs.Path;
@ -342,12 +343,12 @@ public class TestLease {
Mockito.doReturn(
new HdfsFileStatus(0, false, 1, 1024, false, 0, 0, new FsPermission(
(short) 777), "owner", "group", new byte[0], new byte[0],
1010, 0, null)).when(mcp).getFileInfo(anyString());
1010, 0, null, (byte) 0)).when(mcp).getFileInfo(anyString());
Mockito
.doReturn(
new HdfsFileStatus(0, false, 1, 1024, false, 0, 0, new FsPermission(
(short) 777), "owner", "group", new byte[0], new byte[0],
1010, 0, null))
1010, 0, null, (byte) 0))
.when(mcp)
.create(anyString(), (FsPermission) anyObject(), anyString(),
(EnumSetWritable<CreateFlag>) anyObject(), anyBoolean(),

View File

@ -18,22 +18,11 @@
package org.apache.hadoop.hdfs;
import static org.junit.Assert.*;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.tools.DFSAdmin;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
@ -89,136 +78,87 @@ public class TestSnapshotCommands {
}
}
private void toolRun(Tool tool, String cmd, int retcode, String contain)
throws Exception {
String [] cmds = StringUtils.split(cmd, ' ');
System.out.flush();
System.err.flush();
PrintStream origOut = System.out;
PrintStream origErr = System.err;
String output = null;
int ret = 0;
try {
ByteArrayOutputStream bs = new ByteArrayOutputStream(1024);
PrintStream out = new PrintStream(bs);
System.setOut(out);
System.setErr(out);
ret = tool.run(cmds);
System.out.flush();
System.err.flush();
out.close();
output = bs.toString();
} finally {
System.setOut(origOut);
System.setErr(origErr);
}
System.out.println("Output for command: " + cmd + " retcode: " + ret);
if (output != null) {
System.out.println(output);
}
assertEquals(retcode, ret);
if (contain != null) {
assertTrue(output.contains(contain));
}
}
private void FsShellRun(String cmd, int retcode, String contain)
throws Exception {
FsShell shell = new FsShell(new Configuration(conf));
toolRun(shell, cmd, retcode, contain);
}
private void DFSAdminRun(String cmd, int retcode, String contain)
throws Exception {
DFSAdmin admin = new DFSAdmin(new Configuration(conf));
toolRun(admin, cmd, retcode, contain);
}
private void FsShellRun(String cmd) throws Exception {
FsShellRun(cmd, 0, null);
}
@Test
public void testAllowSnapshot() throws Exception {
// Idempotent test
DFSAdminRun("-allowSnapshot /sub1", 0, "Allowing snaphot on /sub1 succeeded");
DFSTestUtil.DFSAdminRun("-allowSnapshot /sub1", 0, "Allowing snaphot on /sub1 succeeded", conf);
// allow normal dir success
FsShellRun("-mkdir /sub2");
DFSAdminRun("-allowSnapshot /sub2", 0, "Allowing snaphot on /sub2 succeeded");
DFSTestUtil.FsShellRun("-mkdir /sub2", conf);
DFSTestUtil.DFSAdminRun("-allowSnapshot /sub2", 0, "Allowing snaphot on /sub2 succeeded", conf);
// allow non-exists dir failed
DFSAdminRun("-allowSnapshot /sub3", -1, null);
DFSTestUtil.DFSAdminRun("-allowSnapshot /sub3", -1, null, conf);
}
@Test
public void testCreateSnapshot() throws Exception {
// test createSnapshot
FsShellRun("-createSnapshot /sub1 sn0", 0, "Created snapshot /sub1/.snapshot/sn0");
FsShellRun("-createSnapshot /sub1 sn0", 1, "there is already a snapshot with the same name \"sn0\"");
FsShellRun("-rmr /sub1/sub1sub2");
FsShellRun("-mkdir /sub1/sub1sub3");
FsShellRun("-createSnapshot /sub1 sn1", 0, "Created snapshot /sub1/.snapshot/sn1");
DFSTestUtil.FsShellRun("-createSnapshot /sub1 sn0", 0, "Created snapshot /sub1/.snapshot/sn0", conf);
DFSTestUtil.FsShellRun("-createSnapshot /sub1 sn0", 1, "there is already a snapshot with the same name \"sn0\"", conf);
DFSTestUtil.FsShellRun("-rmr /sub1/sub1sub2", conf);
DFSTestUtil.FsShellRun("-mkdir /sub1/sub1sub3", conf);
DFSTestUtil.FsShellRun("-createSnapshot /sub1 sn1", 0, "Created snapshot /sub1/.snapshot/sn1", conf);
// check snapshot contents
FsShellRun("-ls /sub1", 0, "/sub1/sub1sub1");
FsShellRun("-ls /sub1", 0, "/sub1/sub1sub3");
FsShellRun("-ls /sub1/.snapshot", 0, "/sub1/.snapshot/sn0");
FsShellRun("-ls /sub1/.snapshot", 0, "/sub1/.snapshot/sn1");
FsShellRun("-ls /sub1/.snapshot/sn0", 0, "/sub1/.snapshot/sn0/sub1sub1");
FsShellRun("-ls /sub1/.snapshot/sn0", 0, "/sub1/.snapshot/sn0/sub1sub2");
FsShellRun("-ls /sub1/.snapshot/sn1", 0, "/sub1/.snapshot/sn1/sub1sub1");
FsShellRun("-ls /sub1/.snapshot/sn1", 0, "/sub1/.snapshot/sn1/sub1sub3");
DFSTestUtil.FsShellRun("-ls /sub1", 0, "/sub1/sub1sub1", conf);
DFSTestUtil.FsShellRun("-ls /sub1", 0, "/sub1/sub1sub3", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot", 0, "/sub1/.snapshot/sn0", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot", 0, "/sub1/.snapshot/sn1", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot/sn0", 0, "/sub1/.snapshot/sn0/sub1sub1", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot/sn0", 0, "/sub1/.snapshot/sn0/sub1sub2", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot/sn1", 0, "/sub1/.snapshot/sn1/sub1sub1", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot/sn1", 0, "/sub1/.snapshot/sn1/sub1sub3", conf);
}
@Test
public void testMkdirUsingReservedName() throws Exception {
// test can not create dir with reserved name: .snapshot
FsShellRun("-ls /");
FsShellRun("-mkdir /.snapshot", 1, "File exists");
FsShellRun("-mkdir /sub1/.snapshot", 1, "File exists");
DFSTestUtil.FsShellRun("-ls /", conf);
DFSTestUtil.FsShellRun("-mkdir /.snapshot", 1, "File exists", conf);
DFSTestUtil.FsShellRun("-mkdir /sub1/.snapshot", 1, "File exists", conf);
// mkdir -p ignore reserved name check if dir already exists
FsShellRun("-mkdir -p /sub1/.snapshot");
FsShellRun("-mkdir -p /sub1/sub1sub1/.snapshot", 1, "mkdir: \".snapshot\" is a reserved name.");
DFSTestUtil.FsShellRun("-mkdir -p /sub1/.snapshot", conf);
DFSTestUtil.FsShellRun("-mkdir -p /sub1/sub1sub1/.snapshot", 1, "mkdir: \".snapshot\" is a reserved name.", conf);
}
@Test
public void testRenameSnapshot() throws Exception {
FsShellRun("-createSnapshot /sub1 sn.orig");
FsShellRun("-renameSnapshot /sub1 sn.orig sn.rename");
FsShellRun("-ls /sub1/.snapshot", 0, "/sub1/.snapshot/sn.rename");
FsShellRun("-ls /sub1/.snapshot/sn.rename", 0, "/sub1/.snapshot/sn.rename/sub1sub1");
FsShellRun("-ls /sub1/.snapshot/sn.rename", 0, "/sub1/.snapshot/sn.rename/sub1sub2");
DFSTestUtil.FsShellRun("-createSnapshot /sub1 sn.orig", conf);
DFSTestUtil.FsShellRun("-renameSnapshot /sub1 sn.orig sn.rename", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot", 0, "/sub1/.snapshot/sn.rename", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot/sn.rename", 0, "/sub1/.snapshot/sn.rename/sub1sub1", conf);
DFSTestUtil.FsShellRun("-ls /sub1/.snapshot/sn.rename", 0, "/sub1/.snapshot/sn.rename/sub1sub2", conf);
//try renaming from a non-existing snapshot
FsShellRun("-renameSnapshot /sub1 sn.nonexist sn.rename", 1,
"renameSnapshot: The snapshot sn.nonexist does not exist for directory /sub1");
DFSTestUtil.FsShellRun("-renameSnapshot /sub1 sn.nonexist sn.rename", 1,
"renameSnapshot: The snapshot sn.nonexist does not exist for directory /sub1", conf);
//try renaming to existing snapshots
FsShellRun("-createSnapshot /sub1 sn.new");
FsShellRun("-renameSnapshot /sub1 sn.new sn.rename", 1,
"renameSnapshot: The snapshot sn.rename already exists for directory /sub1");
FsShellRun("-renameSnapshot /sub1 sn.rename sn.new", 1,
"renameSnapshot: The snapshot sn.new already exists for directory /sub1");
DFSTestUtil.FsShellRun("-createSnapshot /sub1 sn.new", conf);
DFSTestUtil.FsShellRun("-renameSnapshot /sub1 sn.new sn.rename", 1,
"renameSnapshot: The snapshot sn.rename already exists for directory /sub1", conf);
DFSTestUtil.FsShellRun("-renameSnapshot /sub1 sn.rename sn.new", 1,
"renameSnapshot: The snapshot sn.new already exists for directory /sub1", conf);
}
@Test
public void testDeleteSnapshot() throws Exception {
FsShellRun("-createSnapshot /sub1 sn1");
FsShellRun("-deleteSnapshot /sub1 sn1");
FsShellRun("-deleteSnapshot /sub1 sn1", 1,
"deleteSnapshot: Cannot delete snapshot sn1 from path /sub1: the snapshot does not exist.");
DFSTestUtil.FsShellRun("-createSnapshot /sub1 sn1", conf);
DFSTestUtil.FsShellRun("-deleteSnapshot /sub1 sn1", conf);
DFSTestUtil.FsShellRun("-deleteSnapshot /sub1 sn1", 1,
"deleteSnapshot: Cannot delete snapshot sn1 from path /sub1: the snapshot does not exist.", conf);
}
@Test
public void testDisallowSnapshot() throws Exception {
FsShellRun("-createSnapshot /sub1 sn1");
DFSTestUtil.FsShellRun("-createSnapshot /sub1 sn1", conf);
// cannot delete snapshotable dir
FsShellRun("-rmr /sub1", 1, "The directory /sub1 cannot be deleted since /sub1 is snapshottable and already has snapshots");
DFSAdminRun("-disallowSnapshot /sub1", -1,
"disallowSnapshot: The directory /sub1 has snapshot(s). Please redo the operation after removing all the snapshots.");
FsShellRun("-deleteSnapshot /sub1 sn1");
DFSAdminRun("-disallowSnapshot /sub1", 0, "Disallowing snaphot on /sub1 succeeded");
DFSTestUtil.FsShellRun("-rmr /sub1", 1, "The directory /sub1 cannot be deleted since /sub1 is snapshottable and already has snapshots", conf);
DFSTestUtil.DFSAdminRun("-disallowSnapshot /sub1", -1,
"disallowSnapshot: The directory /sub1 has snapshot(s). Please redo the operation after removing all the snapshots.", conf);
DFSTestUtil.FsShellRun("-deleteSnapshot /sub1 sn1", conf);
DFSTestUtil.DFSAdminRun("-disallowSnapshot /sub1", 0, "Disallowing snaphot on /sub1 succeeded", conf);
// Idempotent test
DFSAdminRun("-disallowSnapshot /sub1", 0, "Disallowing snaphot on /sub1 succeeded");
DFSTestUtil.DFSAdminRun("-disallowSnapshot /sub1", 0, "Disallowing snaphot on /sub1 succeeded", conf);
// now it can be deleted
FsShellRun("-rmr /sub1");
DFSTestUtil.FsShellRun("-rmr /sub1", conf);
}
}

View File

@ -0,0 +1,81 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
* Test storage policy related DFSAdmin commands
*/
public class TestStoragePolicyCommands {
private static final short REPL = 1;
private static final int SIZE = 128;
private static Configuration conf;
private static MiniDFSCluster cluster;
private static DistributedFileSystem fs;
@Before
public void clusterSetUp() throws IOException {
conf = new HdfsConfiguration();
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(REPL).build();
cluster.waitActive();
fs = cluster.getFileSystem();
}
@After
public void clusterShutdown() throws IOException{
if(fs != null){
fs.close();
}
if(cluster != null){
cluster.shutdown();
}
}
@Test
public void testSetAndGetStoragePolicy() throws Exception {
final Path foo = new Path("/foo");
final Path bar = new Path(foo, "bar");
DFSTestUtil.createFile(fs, bar, SIZE, REPL, 0);
DFSTestUtil.DFSAdminRun("-setStoragePolicy /foo WARM", 0,
"Set storage policy WARM on " + foo.toString(), conf);
DFSTestUtil.DFSAdminRun("-setStoragePolicy /foo/bar COLD", 0,
"Set storage policy COLD on " + bar.toString(), conf);
DFSTestUtil.DFSAdminRun("-setStoragePolicy /fooz WARM", -1,
"File/Directory does not exist: /fooz", conf);
final BlockStoragePolicy.Suite suite = BlockStoragePolicy
.readBlockStorageSuite(conf);
final BlockStoragePolicy warm = suite.getPolicy("WARM");
final BlockStoragePolicy cold = suite.getPolicy("COLD");
DFSTestUtil.DFSAdminRun("-getStoragePolicy /foo", 0,
"The storage policy of " + foo.toString() + ":\n" + warm, conf);
DFSTestUtil.DFSAdminRun("-getStoragePolicy /foo/bar", 0,
"The storage policy of " + bar.toString() + ":\n" + cold, conf);
DFSTestUtil.DFSAdminRun("-getStoragePolicy /fooz", -1,
"File/Directory does not exist: /fooz", conf);
}
}

View File

@ -89,7 +89,14 @@ public class TestBalancer {
private static final Random r = new Random();
static {
initTestSetup();
}
public static void initTestSetup() {
Dispatcher.setBlockMoveWaitTime(1000L);
// do not create id file since it occupies the disk space
NameNodeConnector.setWrite2IdFile(false);
}
static void initConf(Configuration conf) {

View File

@ -44,7 +44,7 @@ public class TestBalancerWithHANameNodes {
ClientProtocol client;
static {
Dispatcher.setBlockMoveWaitTime(1000L);
TestBalancer.initTestSetup();
}
/**

View File

@ -73,7 +73,7 @@ public class TestBalancerWithMultipleNameNodes {
private static final Random RANDOM = new Random();
static {
Dispatcher.setBlockMoveWaitTime(1000L) ;
TestBalancer.initTestSetup();
}
/** Common objects used in various methods. */

View File

@ -75,7 +75,7 @@ public class TestBalancerWithNodeGroup {
static final int DEFAULT_BLOCK_SIZE = 100;
static {
Dispatcher.setBlockMoveWaitTime(1000L) ;
TestBalancer.initTestSetup();
}
static Configuration createConf() {

View File

@ -40,6 +40,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
@ -52,6 +53,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetworkTopology;
import org.junit.Assert;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Before;
import org.junit.Test;
@ -607,7 +609,6 @@ public class TestBlockManager {
assertEquals(1, ds.getBlockReportCount());
}
/**
* Tests that a namenode doesn't choose a datanode with full disks to
* store blocks.
@ -654,5 +655,20 @@ public class TestBlockManager {
cluster.shutdown();
}
}
}
@Test
public void testUseDelHint() {
DatanodeStorageInfo delHint = new DatanodeStorageInfo(
DFSTestUtil.getLocalDatanodeDescriptor(), new DatanodeStorage("id"));
List<DatanodeStorageInfo> moreThan1Racks = Arrays.asList(delHint);
List<StorageType> excessTypes = new ArrayList<StorageType>();
excessTypes.add(StorageType.DEFAULT);
Assert.assertTrue(BlockManager.useDelHint(true, delHint, null,
moreThan1Racks, excessTypes));
excessTypes.remove(0);
excessTypes.add(StorageType.SSD);
Assert.assertFalse(BlockManager.useDelHint(true, delHint, null,
moreThan1Racks, excessTypes));
}
}

View File

@ -20,6 +20,7 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
@ -47,13 +48,14 @@ import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.LogVerificationAppender;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.TestBlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.StatefulBlockInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.StatefulBlockInfo;
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
@ -228,7 +230,7 @@ public class TestReplicationPolicy {
List<DatanodeStorageInfo> chosenNodes,
Set<Node> excludedNodes) {
return replicator.chooseTarget(filename, numOfReplicas, writer, chosenNodes,
false, excludedNodes, BLOCK_SIZE, StorageType.DEFAULT);
false, excludedNodes, BLOCK_SIZE, TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
}
/**
@ -295,7 +297,7 @@ public class TestReplicationPolicy {
excludedNodes.add(dataNodes[1]);
chosenNodes.add(storages[2]);
targets = replicator.chooseTarget(filename, 1, dataNodes[0], chosenNodes, true,
excludedNodes, BLOCK_SIZE, StorageType.DEFAULT);
excludedNodes, BLOCK_SIZE, TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
System.out.println("targets=" + Arrays.asList(targets));
assertEquals(2, targets.length);
//make sure that the chosen node is in the target.
@ -630,7 +632,7 @@ public class TestReplicationPolicy {
.getNamesystem().getBlockManager().getBlockPlacementPolicy();
DatanodeStorageInfo[] targets = replicator.chooseTarget(filename, 3,
staleNodeInfo, new ArrayList<DatanodeStorageInfo>(), false, null,
BLOCK_SIZE, StorageType.DEFAULT);
BLOCK_SIZE, TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
assertEquals(targets.length, 3);
assertFalse(isOnSameRack(targets[0], staleNodeInfo));
@ -656,7 +658,7 @@ public class TestReplicationPolicy {
// Call chooseTarget
targets = replicator.chooseTarget(filename, 3, staleNodeInfo,
new ArrayList<DatanodeStorageInfo>(), false, null, BLOCK_SIZE,
StorageType.DEFAULT);
TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
assertEquals(targets.length, 3);
assertTrue(isOnSameRack(targets[0], staleNodeInfo));
@ -933,8 +935,16 @@ public class TestReplicationPolicy {
// replica nodes, while storages[2] and dataNodes[5] are in second set.
assertEquals(2, first.size());
assertEquals(2, second.size());
List<StorageType> excessTypes = new ArrayList<StorageType>();
{
// test returning null
excessTypes.add(StorageType.SSD);
assertNull(replicator.chooseReplicaToDelete(
null, null, (short)3, first, second, excessTypes));
}
excessTypes.add(StorageType.DEFAULT);
DatanodeStorageInfo chosen = replicator.chooseReplicaToDelete(
null, null, (short)3, first, second);
null, null, (short)3, first, second, excessTypes);
// Within first set, storages[1] with less free space
assertEquals(chosen, storages[1]);
@ -942,11 +952,12 @@ public class TestReplicationPolicy {
assertEquals(0, first.size());
assertEquals(3, second.size());
// Within second set, storages[5] with less free space
excessTypes.add(StorageType.DEFAULT);
chosen = replicator.chooseReplicaToDelete(
null, null, (short)2, first, second);
null, null, (short)2, first, second, excessTypes);
assertEquals(chosen, storages[5]);
}
/**
* This testcase tests whether the default value returned by
* DFSUtil.getInvalidateWorkPctPerIteration() is positive,

View File

@ -17,25 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NodeType;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.test.PathUtils;
import org.apache.hadoop.util.VersionInfo;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
@ -45,8 +28,25 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.TestBlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NodeType;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.test.PathUtils;
import org.apache.hadoop.util.VersionInfo;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestReplicationPolicyConsiderLoad {
@ -146,7 +146,7 @@ public class TestReplicationPolicyConsiderLoad {
DatanodeStorageInfo[] targets = namenode.getNamesystem().getBlockManager()
.getBlockPlacementPolicy().chooseTarget("testFile.txt", 3,
dataNodes[0], new ArrayList<DatanodeStorageInfo>(), false, null,
1024, StorageType.DEFAULT);
1024, TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
assertEquals(3, targets.length);
Set<DatanodeStorageInfo> targetSet = new HashSet<DatanodeStorageInfo>(

View File

@ -37,6 +37,7 @@ import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.TestBlockStoragePolicy;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.net.NetworkTopology;
@ -258,7 +259,7 @@ public class TestReplicationPolicyWithNodeGroup {
List<DatanodeStorageInfo> chosenNodes,
Set<Node> excludedNodes) {
return replicator.chooseTarget(filename, numOfReplicas, writer, chosenNodes,
false, excludedNodes, BLOCK_SIZE, StorageType.DEFAULT);
false, excludedNodes, BLOCK_SIZE, TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
}
/**
@ -340,7 +341,7 @@ public class TestReplicationPolicyWithNodeGroup {
Set<Node> excludedNodes = new HashSet<Node>();
excludedNodes.add(dataNodes[1]);
targets = repl.chooseTarget(filename, 4, dataNodes[0], chosenNodes, false,
excludedNodes, BLOCK_SIZE, StorageType.DEFAULT);
excludedNodes, BLOCK_SIZE, TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
assertEquals(targets.length, 4);
assertEquals(storages[0], targets[0]);
@ -358,7 +359,7 @@ public class TestReplicationPolicyWithNodeGroup {
excludedNodes.add(dataNodes[1]);
chosenNodes.add(storages[2]);
targets = repl.chooseTarget(filename, 1, dataNodes[0], chosenNodes, true,
excludedNodes, BLOCK_SIZE, StorageType.DEFAULT);
excludedNodes, BLOCK_SIZE, TestBlockStoragePolicy.DEFAULT_STORAGE_POLICY);
System.out.println("targets=" + Arrays.asList(targets));
assertEquals(2, targets.length);
//make sure that the chosen node is in the target.
@ -612,8 +613,10 @@ public class TestReplicationPolicyWithNodeGroup {
replicaList, rackMap, first, second);
assertEquals(3, first.size());
assertEquals(1, second.size());
List<StorageType> excessTypes = new ArrayList<StorageType>();
excessTypes.add(StorageType.DEFAULT);
DatanodeStorageInfo chosen = replicator.chooseReplicaToDelete(
null, null, (short)3, first, second);
null, null, (short)3, first, second, excessTypes);
// Within first set {dataNodes[0], dataNodes[1], dataNodes[2]},
// dataNodes[0] and dataNodes[1] are in the same nodegroup,
// but dataNodes[1] is chosen as less free space
@ -624,16 +627,18 @@ public class TestReplicationPolicyWithNodeGroup {
assertEquals(1, second.size());
// Within first set {dataNodes[0], dataNodes[2]}, dataNodes[2] is chosen
// as less free space
excessTypes.add(StorageType.DEFAULT);
chosen = replicator.chooseReplicaToDelete(
null, null, (short)2, first, second);
null, null, (short)2, first, second, excessTypes);
assertEquals(chosen, storages[2]);
replicator.adjustSetsWithChosenReplica(rackMap, first, second, chosen);
assertEquals(0, first.size());
assertEquals(2, second.size());
// Within second set, dataNodes[5] with less free space
excessTypes.add(StorageType.DEFAULT);
chosen = replicator.chooseReplicaToDelete(
null, null, (short)1, first, second);
null, null, (short)1, first, second, excessTypes);
assertEquals(chosen, storages[5]);
}

View File

@ -1099,7 +1099,8 @@ public class SimulatedFSDataset implements FsDatasetSpi<FsVolumeSpi> {
}
@Override
public void addVolumes(Collection<StorageLocation> volumes) {
public List<StorageLocation> addVolumes(List<StorageLocation> volumes,
final Collection<String> bpids) {
throw new UnsupportedOperationException();
}

View File

@ -0,0 +1,423 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.ReconfigurationException;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockMissingException;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.After;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeoutException;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public class TestDataNodeHotSwapVolumes {
private static final int BLOCK_SIZE = 512;
private MiniDFSCluster cluster;
@After
public void tearDown() {
shutdown();
}
private void startDFSCluster(int numNameNodes, int numDataNodes)
throws IOException {
shutdown();
Configuration conf = new Configuration();
conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
/*
* Lower the DN heartbeat, DF rate, and recheck interval to one second
* so state about failures and datanode death propagates faster.
*/
conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
conf.setInt(DFSConfigKeys.DFS_DF_INTERVAL_KEY, 1000);
conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
1000);
MiniDFSNNTopology nnTopology =
MiniDFSNNTopology.simpleFederatedTopology(numNameNodes);
cluster = new MiniDFSCluster.Builder(conf)
.nnTopology(nnTopology)
.numDataNodes(numDataNodes)
.build();
cluster.waitActive();
}
private void shutdown() {
if (cluster != null) {
cluster.shutdown();
cluster = null;
}
}
private void createFile(Path path, int numBlocks)
throws IOException, InterruptedException, TimeoutException {
final short replicateFactor = 1;
createFile(path, numBlocks, replicateFactor);
}
private void createFile(Path path, int numBlocks, short replicateFactor)
throws IOException, InterruptedException, TimeoutException {
createFile(0, path, numBlocks, replicateFactor);
}
private void createFile(int fsIdx, Path path, int numBlocks)
throws IOException, InterruptedException, TimeoutException {
final short replicateFactor = 1;
createFile(fsIdx, path, numBlocks, replicateFactor);
}
private void createFile(int fsIdx, Path path, int numBlocks,
short replicateFactor)
throws IOException, TimeoutException, InterruptedException {
final int seed = 0;
final DistributedFileSystem fs = cluster.getFileSystem(fsIdx);
DFSTestUtil.createFile(fs, path, BLOCK_SIZE * numBlocks,
replicateFactor, seed);
DFSTestUtil.waitReplication(fs, path, replicateFactor);
}
/**
* Verify whether a file has enough content.
*/
private static void verifyFileLength(FileSystem fs, Path path, int numBlocks)
throws IOException {
FileStatus status = fs.getFileStatus(path);
assertEquals(numBlocks * BLOCK_SIZE, status.getLen());
}
/** Return the number of replicas for a given block in the file. */
private static int getNumReplicas(FileSystem fs, Path file,
int blockIdx) throws IOException {
BlockLocation locs[] = fs.getFileBlockLocations(file, 0, Long.MAX_VALUE);
return locs.length < blockIdx + 1 ? 0 : locs[blockIdx].getNames().length;
}
/**
* Wait the block to have the exact number of replicas as expected.
*/
private static void waitReplication(FileSystem fs, Path file, int blockIdx,
int numReplicas)
throws IOException, TimeoutException, InterruptedException {
int attempts = 50; // Wait 5 seconds.
while (attempts > 0) {
if (getNumReplicas(fs, file, blockIdx) == numReplicas) {
return;
}
Thread.sleep(100);
attempts--;
}
throw new TimeoutException("Timed out waiting the " + blockIdx + "-th block"
+ " of " + file + " to have " + numReplicas + " replicas.");
}
/** Parses data dirs from DataNode's configuration. */
private static Collection<String> getDataDirs(DataNode datanode) {
return datanode.getConf().getTrimmedStringCollection(
DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY);
}
@Test
public void testParseChangedVolumes() throws IOException {
startDFSCluster(1, 1);
DataNode dn = cluster.getDataNodes().get(0);
Configuration conf = dn.getConf();
String oldPaths = conf.get(DFS_DATANODE_DATA_DIR_KEY);
List<StorageLocation> oldLocations = new ArrayList<StorageLocation>();
for (String path : oldPaths.split(",")) {
oldLocations.add(StorageLocation.parse(path));
}
assertFalse(oldLocations.isEmpty());
String newPaths = "/foo/path1,/foo/path2";
conf.set(DFS_DATANODE_DATA_DIR_KEY, newPaths);
DataNode.ChangedVolumes changedVolumes = dn.parseChangedVolumes();
List<StorageLocation> newVolumes = changedVolumes.newLocations;
assertEquals(2, newVolumes.size());
assertEquals("/foo/path1", newVolumes.get(0).getFile().getAbsolutePath());
assertEquals("/foo/path2", newVolumes.get(1).getFile().getAbsolutePath());
List<StorageLocation> removedVolumes = changedVolumes.deactivateLocations;
assertEquals(oldLocations.size(), removedVolumes.size());
for (int i = 0; i < removedVolumes.size(); i++) {
assertEquals(oldLocations.get(i).getFile(),
removedVolumes.get(i).getFile());
}
}
@Test
public void testParseChangedVolumesFailures() throws IOException {
startDFSCluster(1, 1);
DataNode dn = cluster.getDataNodes().get(0);
Configuration conf = dn.getConf();
try {
conf.set(DFS_DATANODE_DATA_DIR_KEY, "");
dn.parseChangedVolumes();
fail("Should throw IOException: empty inputs.");
} catch (IOException e) {
GenericTestUtils.assertExceptionContains("No directory is specified.", e);
}
}
/** Add volumes to the first DataNode. */
private void addVolumes(int numNewVolumes) throws ReconfigurationException {
File dataDir = new File(cluster.getDataDirectory());
DataNode dn = cluster.getDataNodes().get(0); // First DataNode.
Configuration conf = dn.getConf();
String oldDataDir = conf.get(DFS_DATANODE_DATA_DIR_KEY);
List<File> newVolumeDirs = new ArrayList<File>();
StringBuilder newDataDirBuf = new StringBuilder(oldDataDir);
int startIdx = oldDataDir.split(",").length + 1;
// Find the first available (non-taken) directory name for data volume.
while (true) {
File volumeDir = new File(dataDir, "data" + startIdx);
if (!volumeDir.exists()) {
break;
}
startIdx++;
}
for (int i = startIdx; i < startIdx + numNewVolumes; i++) {
File volumeDir = new File(dataDir, "data" + String.valueOf(i));
newVolumeDirs.add(volumeDir);
volumeDir.mkdirs();
newDataDirBuf.append(",");
newDataDirBuf.append(volumeDir.toURI());
}
String newDataDir = newDataDirBuf.toString();
dn.reconfigurePropertyImpl(DFS_DATANODE_DATA_DIR_KEY, newDataDir);
assertEquals(newDataDir, conf.get(DFS_DATANODE_DATA_DIR_KEY));
// Check that all newly created volumes are appropriately formatted.
for (File volumeDir : newVolumeDirs) {
File curDir = new File(volumeDir, "current");
assertTrue(curDir.exists());
assertTrue(curDir.isDirectory());
}
}
private List<List<Integer>> getNumBlocksReport(int namesystemIdx) {
List<List<Integer>> results = new ArrayList<List<Integer>>();
final String bpid = cluster.getNamesystem(namesystemIdx).getBlockPoolId();
List<Map<DatanodeStorage, BlockListAsLongs>> blockReports =
cluster.getAllBlockReports(bpid);
for (Map<DatanodeStorage, BlockListAsLongs> datanodeReport : blockReports) {
List<Integer> numBlocksPerDN = new ArrayList<Integer>();
for (BlockListAsLongs blocks : datanodeReport.values()) {
numBlocksPerDN.add(blocks.getNumberOfBlocks());
}
results.add(numBlocksPerDN);
}
return results;
}
/**
* Test adding one volume on a running MiniDFSCluster with only one NameNode.
*/
@Test
public void testAddOneNewVolume()
throws IOException, ReconfigurationException,
InterruptedException, TimeoutException {
startDFSCluster(1, 1);
String bpid = cluster.getNamesystem().getBlockPoolId();
final int numBlocks = 10;
addVolumes(1);
Path testFile = new Path("/test");
createFile(testFile, numBlocks);
List<Map<DatanodeStorage, BlockListAsLongs>> blockReports =
cluster.getAllBlockReports(bpid);
assertEquals(1, blockReports.size()); // 1 DataNode
assertEquals(3, blockReports.get(0).size()); // 3 volumes
// FSVolumeList uses Round-Robin block chooser by default. Thus the new
// blocks should be evenly located in all volumes.
int minNumBlocks = Integer.MAX_VALUE;
int maxNumBlocks = Integer.MIN_VALUE;
for (BlockListAsLongs blockList : blockReports.get(0).values()) {
minNumBlocks = Math.min(minNumBlocks, blockList.getNumberOfBlocks());
maxNumBlocks = Math.max(maxNumBlocks, blockList.getNumberOfBlocks());
}
// The fullest and the emptiest volumes should differ by at most one block.
assertTrue(Math.abs(maxNumBlocks - minNumBlocks) <= 1);
verifyFileLength(cluster.getFileSystem(), testFile, numBlocks);
}
@Test(timeout = 60000)
public void testAddVolumesDuringWrite()
throws IOException, InterruptedException, TimeoutException,
ReconfigurationException {
startDFSCluster(1, 1);
String bpid = cluster.getNamesystem().getBlockPoolId();
Path testFile = new Path("/test");
createFile(testFile, 4); // Each volume has 2 blocks.
addVolumes(2);
// Continue writing to the same file so that the new volumes receive blocks.
DFSTestUtil.appendFile(cluster.getFileSystem(), testFile, BLOCK_SIZE * 8);
verifyFileLength(cluster.getFileSystem(), testFile, 8 + 4);
// After appending, the per-volume block counts should be [2, 2, 4, 4]
// (sorted).
List<Integer> expectedNumBlocks = Arrays.asList(2, 2, 4, 4);
List<Map<DatanodeStorage, BlockListAsLongs>> blockReports =
cluster.getAllBlockReports(bpid);
assertEquals(1, blockReports.size()); // 1 DataNode
assertEquals(4, blockReports.get(0).size()); // 4 volumes
Map<DatanodeStorage, BlockListAsLongs> dnReport =
blockReports.get(0);
List<Integer> actualNumBlocks = new ArrayList<Integer>();
for (BlockListAsLongs blockList : dnReport.values()) {
actualNumBlocks.add(blockList.getNumberOfBlocks());
}
Collections.sort(actualNumBlocks);
assertEquals(expectedNumBlocks, actualNumBlocks);
}
@Test
public void testAddVolumesToFederationNN()
throws IOException, TimeoutException, InterruptedException,
ReconfigurationException {
// Start a cluster with 2 NameNodes and 1 DataNode. The DataNode has 2
// volumes.
final int numNameNodes = 2;
final int numDataNodes = 1;
startDFSCluster(numNameNodes, numDataNodes);
Path testFile = new Path("/test");
// Create a file on the first namespace with 4 blocks.
createFile(0, testFile, 4);
// Create a file on the second namespace with 4 blocks.
createFile(1, testFile, 4);
// Add 2 volumes to the first DataNode.
final int numNewVolumes = 2;
addVolumes(numNewVolumes);
// Append to the file on the first namespace.
DFSTestUtil.appendFile(cluster.getFileSystem(0), testFile, BLOCK_SIZE * 8);
List<List<Integer>> actualNumBlocks = getNumBlocksReport(0);
assertEquals(cluster.getDataNodes().size(), actualNumBlocks.size());
List<Integer> blocksOnFirstDN = actualNumBlocks.get(0);
Collections.sort(blocksOnFirstDN);
assertEquals(Arrays.asList(2, 2, 4, 4), blocksOnFirstDN);
// Verify the second namespace also has the new volumes and they are empty.
actualNumBlocks = getNumBlocksReport(1);
assertEquals(4, actualNumBlocks.get(0).size());
assertEquals(numNewVolumes,
Collections.frequency(actualNumBlocks.get(0), 0));
}
@Test
public void testRemoveOneVolume()
throws ReconfigurationException, InterruptedException, TimeoutException,
IOException {
startDFSCluster(1, 1);
final short replFactor = 1;
Path testFile = new Path("/test");
createFile(testFile, 10, replFactor);
DataNode dn = cluster.getDataNodes().get(0);
Collection<String> oldDirs = getDataDirs(dn);
String newDirs = oldDirs.iterator().next(); // Keep the first volume.
dn.reconfigurePropertyImpl(
DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY, newDirs);
dn.scheduleAllBlockReport(0);
try {
DFSTestUtil.readFile(cluster.getFileSystem(), testFile);
fail("Expect to throw BlockMissingException.");
} catch (BlockMissingException e) {
GenericTestUtils.assertExceptionContains("Could not obtain block", e);
}
Path newFile = new Path("/newFile");
createFile(newFile, 6);
String bpid = cluster.getNamesystem().getBlockPoolId();
List<Map<DatanodeStorage, BlockListAsLongs>> blockReports =
cluster.getAllBlockReports(bpid);
assertEquals((int)replFactor, blockReports.size());
BlockListAsLongs blocksForVolume1 =
blockReports.get(0).values().iterator().next();
// The first volume holds half of testFile's blocks and all of newFile's blocks.
assertEquals(10 / 2 + 6, blocksForVolume1.getNumberOfBlocks());
}
@Test
public void testReplicatingAfterRemoveVolume()
throws InterruptedException, TimeoutException, IOException,
ReconfigurationException {
startDFSCluster(1, 2);
final DistributedFileSystem fs = cluster.getFileSystem();
final short replFactor = 2;
Path testFile = new Path("/test");
createFile(testFile, 4, replFactor);
DataNode dn = cluster.getDataNodes().get(0);
Collection<String> oldDirs = getDataDirs(dn);
String newDirs = oldDirs.iterator().next(); // Keep the first volume.
dn.reconfigurePropertyImpl(
DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY, newDirs);
// Force DataNode to report missing blocks.
dn.scheduleAllBlockReport(0);
DataNodeTestUtils.triggerDeletionReport(dn);
// The 2nd block only has 1 replica due to the removed data volume.
waitReplication(fs, testFile, 1, 1);
// Wait for the NameNode to replicate the missing blocks.
DFSTestUtil.waitReplication(fs, testFile, replFactor);
}
}
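
A minimal sketch (not part of this patch) of the reconfiguration pattern the tests above rely on: append the new volume directories to the existing dfs.datanode.data.dir value and push the result to the running DataNode. The helper class and method name are illustrative; it sits in the datanode package because reconfigurePropertyImpl is the test-visible entry point rather than public API.

package org.apache.hadoop.hdfs.server.datanode;

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.ReconfigurationException;
import org.apache.hadoop.hdfs.DFSConfigKeys;

// Illustrative helper only; mirrors addVolumes() in TestDataNodeHotSwapVolumes.
class DataDirReconfigSketch {
  /** Append extra directories as data volumes of a running DataNode. */
  static void addDataDirs(DataNode dn, File... newDirs)
      throws ReconfigurationException {
    Configuration conf = dn.getConf();
    StringBuilder dirs =
        new StringBuilder(conf.get(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY));
    for (File dir : newDirs) {
      dir.mkdirs();                          // the directory must exist up front
      dirs.append(",").append(dir.toURI());  // same URI form the tests build
    }
    // The tests call the implementation method directly; outside of tests the
    // new value would normally arrive through the reconfiguration framework.
    dn.reconfigurePropertyImpl(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY,
        dirs.toString());
  }
}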

View File

@ -40,7 +40,10 @@ import org.mockito.Mockito;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
@ -117,6 +120,7 @@ public class TestFsDatasetImpl {
final int numExistingVolumes = dataset.getVolumes().size();
final int totalVolumes = numNewVolumes + numExistingVolumes;
List<StorageLocation> newLocations = new ArrayList<StorageLocation>();
Set<String> expectedVolumes = new HashSet<String>();
for (int i = 0; i < numNewVolumes; i++) {
String path = BASE_DIR + "/newData" + i;
newLocations.add(StorageLocation.parse(path));
@ -125,13 +129,15 @@ public class TestFsDatasetImpl {
}
when(storage.getNumStorageDirs()).thenReturn(totalVolumes);
dataset.addVolumes(newLocations);
dataset.addVolumes(newLocations, Arrays.asList(BLOCK_POOL_IDS));
assertEquals(totalVolumes, dataset.getVolumes().size());
assertEquals(totalVolumes, dataset.storageMap.size());
Set<String> actualVolumes = new HashSet<String>();
for (int i = 0; i < numNewVolumes; i++) {
assertEquals(newLocations.get(i).getFile().getPath(),
dataset.getVolumes().get(numExistingVolumes + i).getBasePath());
actualVolumes.add(dataset.getVolumes().get(numExistingVolumes + i).getBasePath());
}
assertEquals(actualVolumes, expectedVolumes);
}
@Test

View File

@ -0,0 +1,222 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.mover;
import java.io.IOException;
import java.net.URI;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.*;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DBlock;
import org.apache.hadoop.hdfs.server.balancer.NameNodeConnector;
import org.apache.hadoop.hdfs.server.mover.Mover.MLocation;
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.GenericOptionsParser;
import org.junit.Assert;
import org.junit.Test;
public class TestMover {
static Mover newMover(Configuration conf) throws IOException {
final Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
Assert.assertEquals(1, namenodes.size());
final List<NameNodeConnector> nncs = NameNodeConnector.newNameNodeConnectors(
namenodes, Mover.class.getSimpleName(), Mover.MOVER_ID_PATH, conf);
return new Mover(nncs.get(0), conf);
}
@Test
public void testScheduleSameBlock() throws IOException {
final Configuration conf = new HdfsConfiguration();
final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(4).build();
try {
cluster.waitActive();
final DistributedFileSystem dfs = cluster.getFileSystem();
final String file = "/testScheduleSameBlock/file";
{
final FSDataOutputStream out = dfs.create(new Path(file));
out.writeChars("testScheduleSameBlock");
out.close();
}
final Mover mover = newMover(conf);
mover.init();
final Mover.Processor processor = mover.new Processor();
final LocatedBlock lb = dfs.getClient().getLocatedBlocks(file, 0).get(0);
final List<MLocation> locations = MLocation.toLocations(lb);
final MLocation ml = locations.get(0);
final DBlock db = mover.newDBlock(lb.getBlock().getLocalBlock(), locations);
final List<StorageType> storageTypes = new ArrayList<StorageType>(
Arrays.asList(StorageType.DEFAULT, StorageType.DEFAULT));
Assert.assertTrue(processor.scheduleMoveReplica(db, ml, storageTypes));
Assert.assertFalse(processor.scheduleMoveReplica(db, ml, storageTypes));
} finally {
cluster.shutdown();
}
}
private void checkMovePaths(List<Path> actual, Path... expected) {
Assert.assertEquals(expected.length, actual.size());
for (Path p : expected) {
Assert.assertTrue(actual.contains(p));
}
}
/**
* Test the Mover CLI by specifying a list of files/directories with the "-p" option.
* Only one NameNode (and hence one name service) is specified in the conf.
*/
@Test
public void testMoverCli() throws Exception {
final MiniDFSCluster cluster = new MiniDFSCluster
.Builder(new HdfsConfiguration()).numDataNodes(0).build();
try {
final Configuration conf = cluster.getConfiguration(0);
try {
Mover.Cli.getNameNodePathsToMove(conf, "-p", "/foo", "bar");
Assert.fail("Expected exception for illegal path bar");
} catch (IllegalArgumentException e) {
GenericTestUtils.assertExceptionContains("bar is not absolute", e);
}
Map<URI, List<Path>> movePaths = Mover.Cli.getNameNodePathsToMove(conf);
Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
Assert.assertEquals(1, namenodes.size());
Assert.assertEquals(1, movePaths.size());
URI nn = namenodes.iterator().next();
Assert.assertTrue(movePaths.containsKey(nn));
Assert.assertNull(movePaths.get(nn));
movePaths = Mover.Cli.getNameNodePathsToMove(conf, "-p", "/foo", "/bar");
namenodes = DFSUtil.getNsServiceRpcUris(conf);
Assert.assertEquals(1, movePaths.size());
nn = namenodes.iterator().next();
Assert.assertTrue(movePaths.containsKey(nn));
checkMovePaths(movePaths.get(nn), new Path("/foo"), new Path("/bar"));
} finally {
cluster.shutdown();
}
}
@Test
public void testMoverCliWithHAConf() throws Exception {
final Configuration conf = new HdfsConfiguration();
final MiniDFSCluster cluster = new MiniDFSCluster
.Builder(new HdfsConfiguration())
.nnTopology(MiniDFSNNTopology.simpleHATopology())
.numDataNodes(0).build();
HATestUtil.setFailoverConfigurations(cluster, conf, "MyCluster");
try {
Map<URI, List<Path>> movePaths = Mover.Cli.getNameNodePathsToMove(conf,
"-p", "/foo", "/bar");
Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
Assert.assertEquals(1, namenodes.size());
Assert.assertEquals(1, movePaths.size());
URI nn = namenodes.iterator().next();
Assert.assertEquals(new URI("hdfs://MyCluster"), nn);
Assert.assertTrue(movePaths.containsKey(nn));
checkMovePaths(movePaths.get(nn), new Path("/foo"), new Path("/bar"));
} finally {
cluster.shutdown();
}
}
@Test
public void testMoverCliWithFederation() throws Exception {
final MiniDFSCluster cluster = new MiniDFSCluster
.Builder(new HdfsConfiguration())
.nnTopology(MiniDFSNNTopology.simpleFederatedTopology(3))
.numDataNodes(0).build();
final Configuration conf = new HdfsConfiguration();
DFSTestUtil.setFederatedConfiguration(cluster, conf);
try {
Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
Assert.assertEquals(3, namenodes.size());
try {
Mover.Cli.getNameNodePathsToMove(conf, "-p", "/foo");
Assert.fail("Expect exception for missing authority information");
} catch (IllegalArgumentException e) {
GenericTestUtils.assertExceptionContains(
"does not contain scheme and authority", e);
}
try {
Mover.Cli.getNameNodePathsToMove(conf, "-p", "hdfs:///foo");
Assert.fail("Expect exception for missing authority information");
} catch (IllegalArgumentException e) {
GenericTestUtils.assertExceptionContains(
"does not contain scheme and authority", e);
}
try {
Mover.Cli.getNameNodePathsToMove(conf, "-p", "wrong-hdfs://ns1/foo");
Assert.fail("Expect exception for wrong scheme");
} catch (IllegalArgumentException e) {
GenericTestUtils.assertExceptionContains("Cannot resolve the path", e);
}
Iterator<URI> iter = namenodes.iterator();
URI nn1 = iter.next();
URI nn2 = iter.next();
Map<URI, List<Path>> movePaths = Mover.Cli.getNameNodePathsToMove(conf,
"-p", nn1 + "/foo", nn1 + "/bar", nn2 + "/foo/bar");
Assert.assertEquals(2, movePaths.size());
checkMovePaths(movePaths.get(nn1), new Path("/foo"), new Path("/bar"));
checkMovePaths(movePaths.get(nn2), new Path("/foo/bar"));
} finally {
cluster.shutdown();
}
}
@Test
public void testMoverCliWithFederationHA() throws Exception {
final MiniDFSCluster cluster = new MiniDFSCluster
.Builder(new HdfsConfiguration())
.nnTopology(MiniDFSNNTopology.simpleHAFederatedTopology(3))
.numDataNodes(0).build();
final Configuration conf = new HdfsConfiguration();
DFSTestUtil.setFederatedHAConfiguration(cluster, conf);
try {
Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
Assert.assertEquals(3, namenodes.size());
Iterator<URI> iter = namenodes.iterator();
URI nn1 = iter.next();
URI nn2 = iter.next();
URI nn3 = iter.next();
Map<URI, List<Path>> movePaths = Mover.Cli.getNameNodePathsToMove(conf,
"-p", nn1 + "/foo", nn1 + "/bar", nn2 + "/foo/bar", nn3 + "/foobar");
Assert.assertEquals(3, movePaths.size());
checkMovePaths(movePaths.get(nn1), new Path("/foo"), new Path("/bar"));
checkMovePaths(movePaths.get(nn2), new Path("/foo/bar"));
checkMovePaths(movePaths.get(nn3), new Path("/foobar"));
} finally {
cluster.shutdown();
}
}
}
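
The CLI tests above exercise Mover.Cli.getNameNodePathsToMove, which groups the "-p" paths by the NameNode (name service) that owns them; TestStorageMover below then feeds such a map straight into Mover.run. A small sketch of that programmatic flow, using only the calls the tests already make; it is placed in the mover package because these entry points are test-visible there, and the paths and error handling are illustrative.

package org.apache.hadoop.hdfs.server.mover;

import java.net.URI;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.server.balancer.ExitStatus;

// Illustrative driver only; mirrors the calls made by TestMover/TestStorageMover.
class MoverDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new HdfsConfiguration();
    // Group the given absolute paths by the name service that owns them,
    // exactly as the "-p" option does; with no "-p" the map holds a null
    // path list per NameNode, which the tests use to cover the whole namespace.
    Map<URI, List<Path>> nnPaths =
        Mover.Cli.getNameNodePathsToMove(conf, "-p", "/hot/dir", "/cold/file");
    // Run one migration pass over the selected paths.
    int rc = Mover.run(nnPaths, conf);
    if (rc != ExitStatus.SUCCESS.getExitCode()) {
      System.err.println("Mover exited with code " + rc);
    }
  }
}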

View File

@ -0,0 +1,766 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.mover;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSOutputStream;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.DirectoryListing;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.HdfsLocatedFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
import org.apache.hadoop.hdfs.server.balancer.Dispatcher;
import org.apache.hadoop.hdfs.server.balancer.ExitStatus;
import org.apache.hadoop.hdfs.server.balancer.TestBalancer;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotTestHelper;
import org.apache.hadoop.io.IOUtils;
import org.apache.log4j.Level;
import org.junit.Assert;
import org.junit.Test;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
/**
* Test the data migration tool (for Archival Storage)
*/
public class TestStorageMover {
static final Log LOG = LogFactory.getLog(TestStorageMover.class);
static {
((Log4JLogger)LogFactory.getLog(BlockPlacementPolicy.class)
).getLogger().setLevel(Level.ALL);
((Log4JLogger)LogFactory.getLog(Dispatcher.class)
).getLogger().setLevel(Level.ALL);
((Log4JLogger)LogFactory.getLog(DataTransferProtocol.class)).getLogger()
.setLevel(Level.ALL);
}
private static final int BLOCK_SIZE = 1024;
private static final short REPL = 3;
private static final int NUM_DATANODES = 6;
private static final Configuration DEFAULT_CONF = new HdfsConfiguration();
private static final BlockStoragePolicy.Suite DEFAULT_POLICIES;
private static final BlockStoragePolicy HOT;
private static final BlockStoragePolicy WARM;
private static final BlockStoragePolicy COLD;
static {
DEFAULT_CONF.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
DEFAULT_CONF.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1L);
DEFAULT_CONF.setLong(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
2L);
DEFAULT_CONF.setLong(DFSConfigKeys.DFS_MOVER_MOVEDWINWIDTH_KEY, 2000L);
DEFAULT_POLICIES = BlockStoragePolicy.readBlockStorageSuite(DEFAULT_CONF);
HOT = DEFAULT_POLICIES.getPolicy("HOT");
WARM = DEFAULT_POLICIES.getPolicy("WARM");
COLD = DEFAULT_POLICIES.getPolicy("COLD");
TestBalancer.initTestSetup();
Dispatcher.setDelayAfterErrors(1000L);
}
/**
* This scheme defines files/directories and their block storage policies. It
* also defines snapshots.
*/
static class NamespaceScheme {
final List<Path> dirs;
final List<Path> files;
final long fileSize;
final Map<Path, List<String>> snapshotMap;
final Map<Path, BlockStoragePolicy> policyMap;
NamespaceScheme(List<Path> dirs, List<Path> files, long fileSize,
Map<Path,List<String>> snapshotMap,
Map<Path, BlockStoragePolicy> policyMap) {
this.dirs = dirs == null? Collections.<Path>emptyList(): dirs;
this.files = files == null? Collections.<Path>emptyList(): files;
this.fileSize = fileSize;
this.snapshotMap = snapshotMap == null ?
Collections.<Path, List<String>>emptyMap() : snapshotMap;
this.policyMap = policyMap;
}
/**
* Create files/directories/snapshots.
*/
void prepare(DistributedFileSystem dfs, short repl) throws Exception {
for (Path d : dirs) {
dfs.mkdirs(d);
}
for (Path file : files) {
DFSTestUtil.createFile(dfs, file, fileSize, repl, 0L);
}
for (Map.Entry<Path, List<String>> entry : snapshotMap.entrySet()) {
for (String snapshot : entry.getValue()) {
SnapshotTestHelper.createSnapshot(dfs, entry.getKey(), snapshot);
}
}
}
/**
* Set storage policies according to the corresponding scheme.
*/
void setStoragePolicy(DistributedFileSystem dfs) throws Exception {
for (Map.Entry<Path, BlockStoragePolicy> entry : policyMap.entrySet()) {
dfs.setStoragePolicy(entry.getKey(), entry.getValue().getName());
}
}
}
/**
* This scheme defines DataNodes and their storage, including storage types
* and remaining capacities.
*/
static class ClusterScheme {
final Configuration conf;
final int numDataNodes;
final short repl;
final StorageType[][] storageTypes;
final long[][] storageCapacities;
ClusterScheme() {
this(DEFAULT_CONF, NUM_DATANODES, REPL,
genStorageTypes(NUM_DATANODES), null);
}
ClusterScheme(Configuration conf, int numDataNodes, short repl,
StorageType[][] types, long[][] capacities) {
Preconditions.checkArgument(types == null || types.length == numDataNodes);
Preconditions.checkArgument(capacities == null || capacities.length ==
numDataNodes);
this.conf = conf;
this.numDataNodes = numDataNodes;
this.repl = repl;
this.storageTypes = types;
this.storageCapacities = capacities;
}
}
class MigrationTest {
private final ClusterScheme clusterScheme;
private final NamespaceScheme nsScheme;
private final Configuration conf;
private MiniDFSCluster cluster;
private DistributedFileSystem dfs;
private final BlockStoragePolicy.Suite policies;
MigrationTest(ClusterScheme cScheme, NamespaceScheme nsScheme) {
this.clusterScheme = cScheme;
this.nsScheme = nsScheme;
this.conf = clusterScheme.conf;
this.policies = BlockStoragePolicy.readBlockStorageSuite(conf);
}
/**
* Set up the cluster and start NameNode and DataNodes according to the
* corresponding scheme.
*/
void setupCluster() throws Exception {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(clusterScheme
.numDataNodes).storageTypes(clusterScheme.storageTypes)
.storageCapacities(clusterScheme.storageCapacities).build();
cluster.waitActive();
dfs = cluster.getFileSystem();
}
private void runBasicTest(boolean shutdown) throws Exception {
setupCluster();
try {
prepareNamespace();
verify(true);
setStoragePolicy();
migrate();
verify(true);
} finally {
if (shutdown) {
shutdownCluster();
}
}
}
void shutdownCluster() throws Exception {
IOUtils.cleanup(null, dfs);
if (cluster != null) {
cluster.shutdown();
}
}
/**
* Create files/directories and set their storage policies according to the
* corresponding scheme.
*/
void prepareNamespace() throws Exception {
nsScheme.prepare(dfs, clusterScheme.repl);
}
void setStoragePolicy() throws Exception {
nsScheme.setStoragePolicy(dfs);
}
/**
* Run the migration tool.
*/
void migrate() throws Exception {
runMover();
Thread.sleep(5000); // let the NN finish deletion
}
/**
* Verify block locations after running the migration tool.
*/
void verify(boolean verifyAll) throws Exception {
for (DataNode dn : cluster.getDataNodes()) {
DataNodeTestUtils.triggerBlockReport(dn);
}
if (verifyAll) {
verifyNamespace();
} else {
// TODO verify according to the given path list
}
}
private void runMover() throws Exception {
Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
Map<URI, List<Path>> nnMap = Maps.newHashMap();
for (URI nn : namenodes) {
nnMap.put(nn, null);
}
int result = Mover.run(nnMap, conf);
Assert.assertEquals(ExitStatus.SUCCESS.getExitCode(), result);
}
private void verifyNamespace() throws Exception {
HdfsFileStatus status = dfs.getClient().getFileInfo("/");
verifyRecursively(null, status);
}
private void verifyRecursively(final Path parent,
final HdfsFileStatus status) throws Exception {
if (status.isDir()) {
Path fullPath = parent == null ?
new Path("/") : status.getFullPath(parent);
DirectoryListing children = dfs.getClient().listPaths(
fullPath.toString(), HdfsFileStatus.EMPTY_NAME, true);
for (HdfsFileStatus child : children.getPartialListing()) {
verifyRecursively(fullPath, child);
}
} else if (!status.isSymlink()) { // is file
verifyFile(parent, status, null);
}
}
void verifyFile(final Path file, final Byte expectedPolicyId)
throws Exception {
final Path parent = file.getParent();
DirectoryListing children = dfs.getClient().listPaths(
parent.toString(), HdfsFileStatus.EMPTY_NAME, true);
for (HdfsFileStatus child : children.getPartialListing()) {
if (child.getLocalName().equals(file.getName())) {
verifyFile(parent, child, expectedPolicyId);
return;
}
}
Assert.fail("File " + file + " not found.");
}
private void verifyFile(final Path parent, final HdfsFileStatus status,
final Byte expectedPolicyId) throws Exception {
HdfsLocatedFileStatus fileStatus = (HdfsLocatedFileStatus) status;
byte policyId = fileStatus.getStoragePolicy();
BlockStoragePolicy policy = policies.getPolicy(policyId);
if (expectedPolicyId != null) {
Assert.assertEquals((byte)expectedPolicyId, policy.getId());
}
final List<StorageType> types = policy.chooseStorageTypes(
status.getReplication());
for(LocatedBlock lb : fileStatus.getBlockLocations().getLocatedBlocks()) {
final Mover.StorageTypeDiff diff = new Mover.StorageTypeDiff(types,
lb.getStorageTypes());
Assert.assertTrue(fileStatus.getFullName(parent.toString())
+ " with policy " + policy + " has non-empty overlap: " + diff
+ ", the corresponding block is " + lb.getBlock().getLocalBlock(),
diff.removeOverlap());
}
}
Replication getReplication(Path file) throws IOException {
return getOrVerifyReplication(file, null);
}
Replication verifyReplication(Path file, int expectedDiskCount,
int expectedArchiveCount) throws IOException {
final Replication r = new Replication();
r.disk = expectedDiskCount;
r.archive = expectedArchiveCount;
return getOrVerifyReplication(file, r);
}
private Replication getOrVerifyReplication(Path file, Replication expected)
throws IOException {
final List<LocatedBlock> lbs = dfs.getClient().getLocatedBlocks(
file.toString(), 0).getLocatedBlocks();
Assert.assertEquals(1, lbs.size());
LocatedBlock lb = lbs.get(0);
StringBuilder types = new StringBuilder();
final Replication r = new Replication();
for(StorageType t : lb.getStorageTypes()) {
types.append(t).append(", ");
if (t == StorageType.DISK) {
r.disk++;
} else if (t == StorageType.ARCHIVE) {
r.archive++;
} else {
Assert.fail("Unexpected storage type " + t);
}
}
if (expected != null) {
final String s = "file = " + file + "\n types = [" + types + "]";
Assert.assertEquals(s, expected, r);
}
return r;
}
}
static class Replication {
int disk;
int archive;
@Override
public int hashCode() {
return disk ^ archive;
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
} else if (obj == null || !(obj instanceof Replication)) {
return false;
}
final Replication that = (Replication)obj;
return this.disk == that.disk && this.archive == that.archive;
}
@Override
public String toString() {
return "[disk=" + disk + ", archive=" + archive + "]";
}
}
private static StorageType[][] genStorageTypes(int numDataNodes) {
return genStorageTypes(numDataNodes, 0, 0);
}
private static StorageType[][] genStorageTypes(int numDataNodes,
int numAllDisk, int numAllArchive) {
StorageType[][] types = new StorageType[numDataNodes][];
int i = 0;
for (; i < numAllDisk; i++) {
types[i] = new StorageType[]{StorageType.DISK, StorageType.DISK};
}
for (; i < numAllDisk + numAllArchive; i++) {
types[i] = new StorageType[]{StorageType.ARCHIVE, StorageType.ARCHIVE};
}
for (; i < types.length; i++) {
types[i] = new StorageType[]{StorageType.DISK, StorageType.ARCHIVE};
}
return types;
}
private static long[][] genCapacities(int nDatanodes, int numAllDisk,
int numAllArchive, long diskCapacity, long archiveCapacity) {
final long[][] capacities = new long[nDatanodes][];
int i = 0;
for (; i < numAllDisk; i++) {
capacities[i] = new long[]{diskCapacity, diskCapacity};
}
for (; i < numAllDisk + numAllArchive; i++) {
capacities[i] = new long[]{archiveCapacity, archiveCapacity};
}
for(; i < capacities.length; i++) {
capacities[i] = new long[]{diskCapacity, archiveCapacity};
}
return capacities;
}
private static class PathPolicyMap {
final Map<Path, BlockStoragePolicy> map = Maps.newHashMap();
final Path hot = new Path("/hot");
final Path warm = new Path("/warm");
final Path cold = new Path("/cold");
final List<Path> files;
PathPolicyMap(int filesPerDir){
map.put(hot, HOT);
map.put(warm, WARM);
map.put(cold, COLD);
files = new ArrayList<Path>();
for(Path dir : map.keySet()) {
for(int i = 0; i < filesPerDir; i++) {
files.add(new Path(dir, "file" + i));
}
}
}
NamespaceScheme newNamespaceScheme() {
return new NamespaceScheme(Arrays.asList(hot, warm, cold),
files, BLOCK_SIZE/2, null, map);
}
/**
* Move hot files to warm and cold, warm files to hot and cold,
* and cold files to hot and warm.
*/
void moveAround(DistributedFileSystem dfs) throws Exception {
for(Path srcDir : map.keySet()) {
int i = 0;
for(Path dstDir : map.keySet()) {
if (!srcDir.equals(dstDir)) {
final Path src = new Path(srcDir, "file" + i++);
final Path dst = new Path(dstDir, srcDir.getName() + "2" + dstDir.getName());
LOG.info("rename " + src + " to " + dst);
dfs.rename(src, dst);
}
}
}
}
}
/**
* A normal case for Mover: move a file into archival storage
*/
@Test
public void testMigrateFileToArchival() throws Exception {
LOG.info("testMigrateFileToArchival");
final Path foo = new Path("/foo");
Map<Path, BlockStoragePolicy> policyMap = Maps.newHashMap();
policyMap.put(foo, COLD);
NamespaceScheme nsScheme = new NamespaceScheme(null, Arrays.asList(foo),
2*BLOCK_SIZE, null, policyMap);
ClusterScheme clusterScheme = new ClusterScheme(DEFAULT_CONF,
NUM_DATANODES, REPL, genStorageTypes(NUM_DATANODES), null);
new MigrationTest(clusterScheme, nsScheme).runBasicTest(true);
}
/**
* Print a big banner in the test log to make debugging easier.
*/
static void banner(String string) {
LOG.info("\n\n\n\n================================================\n" +
string + "\n" +
"==================================================\n\n");
}
/**
* Move an open file into archival storage
*/
@Test
public void testMigrateOpenFileToArchival() throws Exception {
LOG.info("testMigrateOpenFileToArchival");
final Path fooDir = new Path("/foo");
Map<Path, BlockStoragePolicy> policyMap = Maps.newHashMap();
policyMap.put(fooDir, COLD);
NamespaceScheme nsScheme = new NamespaceScheme(Arrays.asList(fooDir), null,
BLOCK_SIZE, null, policyMap);
ClusterScheme clusterScheme = new ClusterScheme(DEFAULT_CONF,
NUM_DATANODES, REPL, genStorageTypes(NUM_DATANODES), null);
MigrationTest test = new MigrationTest(clusterScheme, nsScheme);
test.setupCluster();
// create an open file
banner("writing to file /foo/bar");
final Path barFile = new Path(fooDir, "bar");
DFSTestUtil.createFile(test.dfs, barFile, BLOCK_SIZE, (short) 1, 0L);
FSDataOutputStream out = test.dfs.append(barFile);
out.writeBytes("hello, ");
((DFSOutputStream) out.getWrappedStream()).hsync();
try {
banner("start data migration");
test.setStoragePolicy(); // set /foo to COLD
test.migrate();
// make sure the under construction block has not been migrated
LocatedBlocks lbs = test.dfs.getClient().getLocatedBlocks(
barFile.toString(), BLOCK_SIZE);
LOG.info("Locations: " + lbs);
List<LocatedBlock> blks = lbs.getLocatedBlocks();
Assert.assertEquals(1, blks.size());
Assert.assertEquals(1, blks.get(0).getLocations().length);
banner("finish the migration, continue writing");
// make sure the writing can continue
out.writeBytes("world!");
((DFSOutputStream) out.getWrappedStream()).hsync();
IOUtils.cleanup(LOG, out);
lbs = test.dfs.getClient().getLocatedBlocks(
barFile.toString(), BLOCK_SIZE);
LOG.info("Locations: " + lbs);
blks = lbs.getLocatedBlocks();
Assert.assertEquals(1, blks.size());
Assert.assertEquals(1, blks.get(0).getLocations().length);
banner("finish writing, starting reading");
// check the content of /foo/bar
FSDataInputStream in = test.dfs.open(barFile);
byte[] buf = new byte[13];
// read from offset 1024
in.readFully(BLOCK_SIZE, buf, 0, buf.length);
IOUtils.cleanup(LOG, in);
Assert.assertEquals("hello, world!", new String(buf));
} finally {
test.shutdownCluster();
}
}
/**
* Test directories with Hot, Warm and Cold policies.
*/
@Test
public void testHotWarmColdDirs() throws Exception {
LOG.info("testHotWarmColdDirs");
PathPolicyMap pathPolicyMap = new PathPolicyMap(3);
NamespaceScheme nsScheme = pathPolicyMap.newNamespaceScheme();
ClusterScheme clusterScheme = new ClusterScheme();
MigrationTest test = new MigrationTest(clusterScheme, nsScheme);
try {
test.runBasicTest(false);
pathPolicyMap.moveAround(test.dfs);
test.migrate();
test.verify(true);
} finally {
test.shutdownCluster();
}
}
private void waitForAllReplicas(int expectedReplicaNum, Path file,
DistributedFileSystem dfs) throws Exception {
for (int i = 0; i < 5; i++) {
LocatedBlocks lbs = dfs.getClient().getLocatedBlocks(file.toString(), 0,
BLOCK_SIZE);
LocatedBlock lb = lbs.get(0);
if (lb.getLocations().length >= expectedReplicaNum) {
return;
} else {
Thread.sleep(1000);
}
}
}
/**
* Test the case where DISK storage is running out of space.
*/
@Test
public void testNoSpaceDisk() throws Exception {
LOG.info("testNoSpaceDisk");
final PathPolicyMap pathPolicyMap = new PathPolicyMap(0);
final NamespaceScheme nsScheme = pathPolicyMap.newNamespaceScheme();
final long diskCapacity = (6 + HdfsConstants.MIN_BLOCKS_FOR_WRITE)
* BLOCK_SIZE;
final long archiveCapacity = 100 * BLOCK_SIZE;
final long[][] capacities = genCapacities(NUM_DATANODES, 1, 1,
diskCapacity, archiveCapacity);
Configuration conf = new Configuration(DEFAULT_CONF);
final ClusterScheme clusterScheme = new ClusterScheme(conf,
NUM_DATANODES, REPL, genStorageTypes(NUM_DATANODES, 1, 1), capacities);
final MigrationTest test = new MigrationTest(clusterScheme, nsScheme);
try {
test.runBasicTest(false);
// create hot files with replication 3 until there is no more space.
final short replication = 3;
{
int hotFileCount = 0;
try {
for (; ; hotFileCount++) {
final Path p = new Path(pathPolicyMap.hot, "file" + hotFileCount);
DFSTestUtil.createFile(test.dfs, p, BLOCK_SIZE, replication, 0L);
waitForAllReplicas(replication, p, test.dfs);
}
} catch (IOException e) {
LOG.info("Expected: hotFileCount=" + hotFileCount, e);
}
Assert.assertTrue(hotFileCount >= 1);
}
// create hot files with replication 1 to use up all remaining space.
{
int hotFileCount_r1 = 0;
try {
for (; ; hotFileCount_r1++) {
final Path p = new Path(pathPolicyMap.hot, "file_r1_" + hotFileCount_r1);
DFSTestUtil.createFile(test.dfs, p, BLOCK_SIZE, (short) 1, 0L);
waitForAllReplicas(1, p, test.dfs);
}
} catch (IOException e) {
LOG.info("Expected: hotFileCount_r1=" + hotFileCount_r1, e);
}
}
{ // test increasing replication. Since DISK is full,
// new replicas should be stored in ARCHIVE as a fallback storage.
final Path file0 = new Path(pathPolicyMap.hot, "file0");
final Replication r = test.getReplication(file0);
final short newReplication = (short) 5;
test.dfs.setReplication(file0, newReplication);
Thread.sleep(10000);
test.verifyReplication(file0, r.disk, newReplication - r.disk);
}
{ // test creating a cold file and then increase replication
final Path p = new Path(pathPolicyMap.cold, "foo");
DFSTestUtil.createFile(test.dfs, p, BLOCK_SIZE, replication, 0L);
test.verifyReplication(p, 0, replication);
final short newReplication = 5;
test.dfs.setReplication(p, newReplication);
Thread.sleep(10000);
test.verifyReplication(p, 0, newReplication);
}
{ //test move a hot file to warm
final Path file1 = new Path(pathPolicyMap.hot, "file1");
test.dfs.rename(file1, pathPolicyMap.warm);
test.migrate();
test.verifyFile(new Path(pathPolicyMap.warm, "file1"), WARM.getId());
}
} finally {
test.shutdownCluster();
}
}
/**
* Test the case where ARCHIVE storage is running out of space.
*/
@Test
public void testNoSpaceArchive() throws Exception {
LOG.info("testNoSpaceArchive");
final PathPolicyMap pathPolicyMap = new PathPolicyMap(0);
final NamespaceScheme nsScheme = pathPolicyMap.newNamespaceScheme();
final long diskCapacity = 100 * BLOCK_SIZE;
final long archiveCapacity = (6 + HdfsConstants.MIN_BLOCKS_FOR_WRITE)
* BLOCK_SIZE;
final long[][] capacities = genCapacities(NUM_DATANODES, 1, 1,
diskCapacity, archiveCapacity);
final ClusterScheme clusterScheme = new ClusterScheme(DEFAULT_CONF,
NUM_DATANODES, REPL, genStorageTypes(NUM_DATANODES, 1, 1), capacities);
final MigrationTest test = new MigrationTest(clusterScheme, nsScheme);
try {
test.runBasicTest(false);
// create cold files with replication 3 until there is no more space.
final short replication = 3;
{
int coldFileCount = 0;
try {
for (; ; coldFileCount++) {
final Path p = new Path(pathPolicyMap.cold, "file" + coldFileCount);
DFSTestUtil.createFile(test.dfs, p, BLOCK_SIZE, replication, 0L);
waitForAllReplicas(replication, p, test.dfs);
}
} catch (IOException e) {
LOG.info("Expected: coldFileCount=" + coldFileCount, e);
}
Assert.assertTrue(coldFileCount >= 1);
}
// create cold files with replication 1 to use up all remaining space.
{
int coldFileCount_r1 = 0;
try {
for (; ; coldFileCount_r1++) {
final Path p = new Path(pathPolicyMap.cold, "file_r1_" + coldFileCount_r1);
DFSTestUtil.createFile(test.dfs, p, BLOCK_SIZE, (short) 1, 0L);
waitForAllReplicas(1, p, test.dfs);
}
} catch (IOException e) {
LOG.info("Expected: coldFileCount_r1=" + coldFileCount_r1, e);
}
}
{ // test increasing replication but new replicas cannot be created
// since no more ARCHIVE space.
final Path file0 = new Path(pathPolicyMap.cold, "file0");
final Replication r = test.getReplication(file0);
LOG.info("XXX " + file0 + ": replication=" + r);
Assert.assertEquals(0, r.disk);
final short newReplication = (short) 5;
test.dfs.setReplication(file0, newReplication);
Thread.sleep(10000);
test.verifyReplication(file0, 0, r.archive);
}
{ // test creating a hot file
final Path p = new Path(pathPolicyMap.hot, "foo");
DFSTestUtil.createFile(test.dfs, p, BLOCK_SIZE, (short) 3, 0L);
}
{ //test move a cold file to warm
final Path file1 = new Path(pathPolicyMap.cold, "file1");
test.dfs.rename(file1, pathPolicyMap.warm);
test.migrate();
test.verify(true);
}
} finally {
test.shutdownCluster();
}
}
}
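
Behind verifyFile() above, the check is: expand the file's storage policy into the storage types expected for its replication (policy.chooseStorageTypes), then cancel those expectations against the storage types the block's replicas actually sit on; if nothing is left over, the block already satisfies its policy and the Mover has nothing to do for it. Mover.StorageTypeDiff does that bookkeeping; the following is only the idea, written with plain lists, and the class and method names are illustrative.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hdfs.StorageType;

// Illustration of the policy check used by verifyFile(); not the real
// Mover.StorageTypeDiff, whose source is not part of this hunk.
class PolicyCheckSketch {
  static boolean satisfiesPolicy(List<StorageType> expected,
      StorageType[] actual) {
    List<StorageType> remaining = new ArrayList<StorageType>(expected);
    for (StorageType t : actual) {
      remaining.remove(t);        // cancel one matching expectation, if any
    }
    return remaining.isEmpty();   // nothing left to migrate for this block
  }

  public static void main(String[] args) {
    // A WARM-like expectation for replication 3: one DISK plus two ARCHIVE.
    List<StorageType> expected = Arrays.asList(
        StorageType.DISK, StorageType.ARCHIVE, StorageType.ARCHIVE);
    StorageType[] actual = {
        StorageType.DISK, StorageType.ARCHIVE, StorageType.ARCHIVE };
    System.out.println(satisfiesPolicy(expected, actual));   // prints true
  }
}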

View File

@ -121,9 +121,9 @@ public class TestAddBlockRetry {
}
return ret;
}
}).when(spyBM).chooseTarget(Mockito.anyString(), Mockito.anyInt(),
}).when(spyBM).chooseTarget4NewBlock(Mockito.anyString(), Mockito.anyInt(),
Mockito.<DatanodeDescriptor>any(), Mockito.<HashSet<Node>>any(),
Mockito.anyLong(), Mockito.<List<String>>any());
Mockito.anyLong(), Mockito.<List<String>>any(), Mockito.anyByte());
// create file
nn.create(src, FsPermission.getFileDefault(),
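
This hunk tracks a BlockManager API change that comes with the archival-storage work: target selection for a new block now goes through chooseTarget4NewBlock and takes one extra byte argument, the storage policy ID of the file being written, hence the added Mockito.anyByte() matcher. A hedged sketch of the updated stubbing, factored into a helper so the new matcher stands out; spyBM and answer stand in for the test's existing spy and Answer.

import java.util.HashSet;
import java.util.List;

import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.net.Node;
import org.mockito.Mockito;
import org.mockito.stubbing.Answer;

// Illustrative only; same matcher list as the hunk above.
class ChooseTargetStubSketch {
  static void stubChooseTarget(BlockManager spyBM, Answer<?> answer) {
    Mockito.doAnswer(answer).when(spyBM).chooseTarget4NewBlock(
        Mockito.anyString(), Mockito.anyInt(),
        Mockito.<DatanodeDescriptor>any(), Mockito.<HashSet<Node>>any(),
        Mockito.anyLong(), Mockito.<List<String>>any(),
        Mockito.anyByte());   // new parameter: the file's storage policy ID
  }
}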

View File

@ -29,13 +29,13 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.AppendTestUtil;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
@ -121,10 +121,10 @@ public class TestDeleteRace {
boolean returnChosenNodes,
Set<Node> excludedNodes,
long blocksize,
StorageType storageType) {
final BlockStoragePolicy storagePolicy) {
DatanodeStorageInfo[] results = super.chooseTarget(srcPath,
numOfReplicas, writer, chosenNodes, returnChosenNodes, excludedNodes,
blocksize, storageType);
blocksize, storagePolicy);
try {
Thread.sleep(3000);
} catch (InterruptedException e) {}
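
The corresponding change on the placement side is visible in this hunk: BlockPlacementPolicy#chooseTarget now receives the file's BlockStoragePolicy instead of a single StorageType, so the policy can be expanded into one storage type per requested replica. That expansion is the same chooseStorageTypes call TestStorageMover uses when verifying files; a tiny sketch follows, where the WARM result noted in the comment is the expected default, stated as an assumption.

import java.util.List;

import org.apache.hadoop.hdfs.BlockStoragePolicy;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.StorageType;

// Illustrative only: how a storage policy expands into per-replica storage
// types, which is what chooseTarget now consumes instead of a single type.
class StoragePolicyExpansionSketch {
  public static void main(String[] args) {
    BlockStoragePolicy.Suite suite =
        BlockStoragePolicy.readBlockStorageSuite(new HdfsConfiguration());
    BlockStoragePolicy warm = suite.getPolicy("WARM");
    // For replication 3, WARM is expected to ask for one DISK replica and
    // two ARCHIVE replicas.
    List<StorageType> types = warm.chooseStorageTypes((short) 3);
    System.out.println(types);
  }
}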

Some files were not shown because too many files have changed in this diff.