Merge r1234388 through r1236385 from 0.23.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23-PB@1236395 13f79535-47bb-0310-9956-ffa450edef68
Author: Tsz-wo Sze
Date: 2012-01-26 21:37:26 +00:00
Commit: 359c746ca7
131 changed files with 12719 additions and 394 deletions


@ -127,6 +127,17 @@
<unpack>false</unpack>
</binaries>
</moduleSet>
<moduleSet>
<includes>
<include>org.apache.hadoop:hadoop-mapreduce-client-jobclient</include>
</includes>
<binaries>
<attachmentClassifier>tests</attachmentClassifier>
<outputDirectory>share/hadoop/${hadoop.component}</outputDirectory>
<includeDependencies>false</includeDependencies>
<unpack>false</unpack>
</binaries>
</moduleSet>
</moduleSets>
<dependencySets>
<dependencySet>


@ -125,6 +125,21 @@ Release 0.23.1 - Unreleased
HADOOP-7975. Add LZ4 as an entry in the default codec list, missed by HADOOP-7657 (harsh)
HADOOP-7987. Support setting the run-as user in unsecure mode. (jitendra)
HADOOP-4515. Configuration#getBoolean must not be case sensitive. (Sho Shimauchi via harsh)
HADOOP-6490. Use StringUtils over String#replace in Path#normalizePath.
(Uma Maheswara Rao G via harsh)
HADOOP-7574. Improve FSShell -stat, add user/group elements.
(XieXianshan via harsh)
HADOOP-7736. Remove duplicate Path#normalizePath call. (harsh)
HADOOP-7919. Remove the unused hadoop.logfile.* properties from the
core-default.xml file. (harsh)
OPTIMIZATIONS
BUG FIXES
@ -207,6 +222,9 @@ Release 0.23.1 - Unreleased
HADOOP-7986. Adding config for MapReduce History Server protocol in
hadoop-policy.xml for service level authorization. (Mahadev Konar via vinodkv)
HADOOP-7981. Improve documentation for org.apache.hadoop.io.compress.
Decompressor.getRemaining (Jonathan Eagles via mahadev)
Release 0.23.0 - 2011-11-01
INCOMPATIBLE CHANGES


@ -753,11 +753,6 @@
<section>
<title> secondarynamenode </title>
<note>
The Secondary NameNode has been deprecated. Instead, consider using the
<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Checkpoint+Node">Checkpoint Node</a> or
<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Backup+Node">Backup Node</a>.
</note>
<p>
Runs the HDFS secondary
namenode. See <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Secondary+NameNode">Secondary NameNode</a>


@ -826,6 +826,12 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
*/
public boolean getBoolean(String name, boolean defaultValue) {
String valueString = getTrimmed(name);
if (null == valueString || "".equals(valueString)) {
return defaultValue;
}
valueString = valueString.toLowerCase();
if ("true".equals(valueString))
return true;
else if ("false".equals(valueString))


@ -18,10 +18,12 @@
package org.apache.hadoop.fs;
import java.net.*;
import java.io.*;
import org.apache.avro.reflect.Stringable;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.avro.reflect.Stringable;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@ -76,7 +78,7 @@ public class Path implements Comparable {
}
URI resolved = parentUri.resolve(child.uri);
initialize(resolved.getScheme(), resolved.getAuthority(),
normalizePath(resolved.getPath()), resolved.getFragment());
resolved.getPath(), resolved.getFragment());
}
private void checkPathArg( String path ) {
@ -158,8 +160,8 @@ public class Path implements Comparable {
private String normalizePath(String path) {
// remove double slashes & backslashes
path = path.replace("//", "/");
path = path.replace("\\", "/");
path = StringUtils.replace(path, "//", "/");
path = StringUtils.replace(path, "\\", "/");
// trim trailing slash from non-root path (ignoring windows drive)
int minLength = hasWindowsDrive(path, true) ? 4 : 1;


@ -32,9 +32,11 @@ import org.apache.hadoop.fs.FileStatus;
* Print statistics about path in specified format.
* Format sequences:
* %b: Size of file in blocks
* %g: Group name of owner
* %n: Filename
* %o: Block size
* %r: replication
* %u: User name of owner
* %y: UTC date as &quot;yyyy-MM-dd HH:mm:ss&quot;
* %Y: Milliseconds since January 1, 1970 UTC
*/
@ -50,8 +52,8 @@ class Stat extends FsCommand {
public static final String USAGE = "[format] <path> ...";
public static final String DESCRIPTION =
"Print statistics about the file/directory at <path>\n" +
"in the specified format. Format accepts filesize in blocks (%b), filename (%n),\n" +
"block size (%o), replication (%r), modification date (%y, %Y)\n";
"in the specified format. Format accepts filesize in blocks (%b), group name of owner(%g),\n" +
"filename (%n), block size (%o), replication (%r), user name of owner(%u), modification date (%y, %Y)\n";
protected static final SimpleDateFormat timeFmt;
static {
@ -92,6 +94,9 @@ class Stat extends FsCommand {
? "directory"
: (stat.isFile() ? "regular file" : "symlink"));
break;
case 'g':
buf.append(stat.getGroup());
break;
case 'n':
buf.append(item.path.getName());
break;
@ -101,6 +106,9 @@ class Stat extends FsCommand {
case 'r':
buf.append(stat.getReplication());
break;
case 'u':
buf.append(stat.getOwner());
break;
case 'y':
buf.append(timeFmt.format(new Date(stat.getModificationTime())));
break;
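Illustration (not part of the diff): the new %u and %g sequences let -stat report owner and group alongside the existing fields. A hypothetical invocation through FsShell (the path is made up; ToolRunner.run throws Exception):

int rc = ToolRunner.run(new Configuration(), new FsShell(), new String[] {
    "-stat", "%u %g %b %o %r %y %n", "/user/someuser/file.txt" });
// prints owner, group, size in blocks, block size, replication,
// modification time and file name for the given path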


@ -49,7 +49,7 @@ public interface Decompressor {
public void setInput(byte[] b, int off, int len);
/**
* Returns true if the input data buffer is empty and
* Returns <code>true</code> if the input data buffer is empty and
* {@link #setInput(byte[], int, int)} should be called to
* provide more input.
*
@ -76,8 +76,11 @@ public interface Decompressor {
public boolean needsDictionary();
/**
* Returns true if the end of the decompressed
* data output stream has been reached.
* Returns <code>true</code> if the end of the decompressed
* data output stream has been reached. Indicates a concatenated data stream
* when finished() returns <code>true</code> and {@link #getRemaining()}
* returns a positive value. finished() will be reset with the
* {@link #reset()} method.
* @return <code>true</code> if the end of the decompressed
* data output stream has been reached.
*/
@ -98,15 +101,23 @@ public interface Decompressor {
public int decompress(byte[] b, int off, int len) throws IOException;
/**
* Returns the number of bytes remaining in the compressed-data buffer;
* typically called after the decompressor has finished decompressing
* the current gzip stream (a.k.a. "member").
* Returns the number of bytes remaining in the compressed data buffer.
* Indicates a concatenated data stream if {@link #finished()} returns
* <code>true</code> and getRemaining() returns a positive value. If
* {@link #finished()} returns <code>true</code> and getRemaining() returns
* a zero value, indicates that the end of data stream has been reached and
* is not a concatenated data stream.
* @return The number of bytes remaining in the compressed data buffer.
*/
public int getRemaining();
/**
* Resets decompressor and input and output buffers so that a new set of
* input data can be processed.
* input data can be processed. If {@link #finished()}} returns
* <code>true</code> and {@link #getRemaining()} returns a positive value,
* reset() is called before processing of the next data stream in the
* concatenated data stream. {@link #finished()} will be reset and will
* return <code>false</code> when reset() is called.
*/
public void reset();
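Illustration (not part of the diff): a rough consumer loop honouring the contract spelled out above, assuming decompressor came from a codec that supports concatenated streams and compressed already holds the whole input (real callers feed input incrementally via needsInput()/setInput()):

byte[] out = new byte[64 * 1024];
decompressor.setInput(compressed, 0, compressed.length);
while (true) {
  int n = decompressor.decompress(out, 0, out.length); // may legitimately return 0
  consume(out, n);                                      // hypothetical sink for the output
  if (decompressor.finished()) {
    int remaining = decompressor.getRemaining();
    if (remaining > 0) {
      // finished() && getRemaining() > 0: a concatenated member follows
      decompressor.reset();                             // also clears finished()
      decompressor.setInput(compressed, compressed.length - remaining, remaining);
    } else {
      break;                                            // true end of the data stream
    }
  } else if (decompressor.needsInput()) {
    break;                                              // nothing left to feed in this sketch
  }
}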


@ -80,6 +80,7 @@ public class UserGroupInformation {
* Percentage of the ticket window to use before we renew ticket.
*/
private static final float TICKET_RENEW_WINDOW = 0.80f;
static final String HADOOP_USER_NAME = "HADOOP_USER_NAME";
/**
* UgiMetrics maintains UGI activity statistics
@ -137,7 +138,16 @@ public class UserGroupInformation {
LOG.debug("using kerberos user:"+user);
}
}
// if we don't have a kerberos user, use the OS user
//If we don't have a kerberos user and security is disabled, check
//if user is specified in the environment or properties
if (!isSecurityEnabled() && (user == null)) {
String envUser = System.getenv(HADOOP_USER_NAME);
if (envUser == null) {
envUser = System.getProperty(HADOOP_USER_NAME);
}
user = envUser == null ? null : new User(envUser);
}
// use the OS user
if (user == null) {
user = getCanonicalUser(OS_PRINCIPAL_CLASS);
if (LOG.isDebugEnabled()) {
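Illustration (not part of the diff), mirroring the TestUserFromEnv test added later in this commit: with security disabled, the login user can now be forced through HADOOP_USER_NAME, the environment variable being consulted before the JVM system property.

System.setProperty("HADOOP_USER_NAME", "randomUser");
UserGroupInformation ugi = UserGroupInformation.getLoginUser(); // throws IOException
String name = ugi.getUserName();                                // "randomUser"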


@ -134,20 +134,6 @@
</description>
</property>
<!--- logging properties -->
<property>
<name>hadoop.logfile.size</name>
<value>10000000</value>
<description>The max size of each log file</description>
</property>
<property>
<name>hadoop.logfile.count</name>
<value>10</value>
<description>The max number of log files</description>
</property>
<!-- i/o properties -->
<property>
<name>io.file.buffer.size</name>


@ -451,6 +451,9 @@ public class TestConfiguration extends TestCase {
appendProperty("test.bool3", " true ");
appendProperty("test.bool4", " false ");
appendProperty("test.bool5", "foo");
appendProperty("test.bool6", "TRUE");
appendProperty("test.bool7", "FALSE");
appendProperty("test.bool8", "");
endConfig();
Path fileResource = new Path(CONFIG);
conf.addResource(fileResource);
@ -459,6 +462,9 @@ public class TestConfiguration extends TestCase {
assertEquals(true, conf.getBoolean("test.bool3", false));
assertEquals(false, conf.getBoolean("test.bool4", true));
assertEquals(true, conf.getBoolean("test.bool5", true));
assertEquals(true, conf.getBoolean("test.bool6", false));
assertEquals(false, conf.getBoolean("test.bool7", true));
assertEquals(false, conf.getBoolean("test.bool8", false));
}
public void testFloatValues() throws IOException {


@ -0,0 +1,32 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.hadoop.security;
import java.io.IOException;
import org.junit.Assert;
import org.junit.Test;
public class TestUserFromEnv {
@Test
public void testUserFromEnvironment() throws IOException {
System.setProperty(UserGroupInformation.HADOOP_USER_NAME, "randomUser");
Assert.assertEquals("randomUser", UserGroupInformation.getLoginUser()
.getUserName());
}
}


@ -610,11 +610,11 @@
</comparator>
<comparator>
<type>RegexpComparator</type>
<expected-output>^( |\t)*in the specified format. Format accepts filesize in blocks \(%b\), filename \(%n\),( )*</expected-output>
<expected-output>^( |\t)*in the specified format. Format accepts filesize in blocks \(%b\), group name of owner\(%g\),( )*</expected-output>
</comparator>
<comparator>
<type>RegexpComparator</type>
<expected-output>^( |\t)*block size \(%o\), replication \(%r\), modification date \(%y, %Y\)( )*</expected-output>
<expected-output>^( |\t)*filename \(%n\), block size \(%o\), replication \(%r\), user name of owner\(%u\), modification date \(%y, %Y\)( )*</expected-output>
</comparator>
</comparators>
</test>


@ -18,4 +18,4 @@
OK_RELEASEAUDIT_WARNINGS=0
OK_FINDBUGS_WARNINGS=0
OK_JAVADOC_WARNINGS=2
OK_JAVADOC_WARNINGS=0


@ -53,6 +53,11 @@
<artifactId>mockito-all</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-annotations</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-server</artifactId>


@ -219,7 +219,7 @@ public class HttpFSServer {
* operation is @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.GetOpValues#LISTSTATUS}
* @param doAs user being impersonated, defualt value is none. It can be used
* only if the current user is a HttpFSServer proxyuser.
* @param override, default is true. Used only for
* @param override default is true. Used only for
* @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
* @param blockSize block size to set, used only by
* @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
@ -419,7 +419,7 @@ public class HttpFSServer {
* @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
* @param group group to set, used only for
* @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
* @param override, default is true. Used only for
* @param override default is true. Used only for
* @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
* @param blockSize block size to set, used only by
* @link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.


@ -201,6 +201,10 @@ Release 0.23.1 - UNRELEASED
HDFS-2817. Combine the two TestSafeMode test suites. (todd)
HDFS-2818. Fix a missing space issue in HDFS webapps' title tags. (Devaraj K via harsh)
HDFS-2397. Undeprecate SecondaryNameNode (eli)
OPTIMIZATIONS
HDFS-2130. Switch default checksum to CRC32C. (todd)
@ -215,6 +219,12 @@ Release 0.23.1 - UNRELEASED
for a client on the same node as the block file. (Andrew Purtell,
Suresh Srinivas and Jitendra Nath Pandey via szetszwo)
HDFS-2825. Add test hook to turn off the writer preferring its local
DN. (todd)
HDFS-2826. Add test case for HDFS-1476 (safemode can initialize
replication queues before exiting) (todd)
BUG FIXES
HDFS-2541. For a sufficiently large value of blocks, the DN Scanner
@ -276,6 +286,15 @@ Release 0.23.1 - UNRELEASED
HDFS-2816. Fix missing license header in httpfs findbugsExcludeFile.xml.
(hitesh via tucu)
HDFS-2822. processMisReplicatedBlock incorrectly identifies
under-construction blocks as under-replicated. (todd)
HDFS-442. dfsthroughput in test jar throws NPE (harsh)
HDFS-2836. HttpFSServer still has 2 javadoc warnings in trunk (revans2 via tucu)
HDFS-2837. mvn javadoc:javadoc not seeing LimitedPrivate class (revans2 via tucu)
Release 0.23.0 - 2011-11-01
INCOMPATIBLE CHANGES


@ -112,17 +112,18 @@
problems.
</li>
<li>
Secondary NameNode (deprecated): performs periodic checkpoints of the
Secondary NameNode: performs periodic checkpoints of the
namespace and helps keep the size of file containing log of HDFS
modifications within certain limits at the NameNode.
Replaced by Checkpoint node.
</li>
<li>
Checkpoint node: performs periodic checkpoints of the namespace and
helps minimize the size of the log stored at the NameNode
containing changes to the HDFS.
Replaces the role previously filled by the Secondary NameNode.
NameNode allows multiple Checkpoint nodes simultaneously,
Replaces the role previously filled by the Secondary NameNode,
though is not yet battle hardened.
The NameNode allows multiple Checkpoint nodes simultaneously,
as long as there are no Backup nodes registered with the system.
</li>
<li>
@ -132,6 +133,7 @@
which is always in sync with the active NameNode namespace state.
Only one Backup node may be registered with the NameNode at once.
</li>
</ul>
</li>
</ul>
@ -234,12 +236,6 @@
</section>
<section> <title>Secondary NameNode</title>
<note>
The Secondary NameNode has been deprecated.
Instead, consider using the
<a href="hdfs_user_guide.html#Checkpoint+node">Checkpoint Node</a> or
<a href="hdfs_user_guide.html#Backup+node">Backup Node</a>.
</note>
<p>
The NameNode stores modifications to the file system as a log
appended to a native file system file, <code>edits</code>.
@ -287,7 +283,9 @@
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#secondarynamenode">secondarynamenode</a>.
</p>
</section><section> <title> Checkpoint Node </title>
</section>
<section> <title> Checkpoint Node </title>
<p>NameNode persists its namespace using two files: <code>fsimage</code>,
which is the latest checkpoint of the namespace and <code>edits</code>,
a journal (log) of changes to the namespace since the checkpoint.


@ -1793,7 +1793,8 @@ public class BlockManager {
public void processMisReplicatedBlocks() {
assert namesystem.hasWriteLock();
long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0;
long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0,
nrUnderConstruction = 0;
neededReplications.clear();
for (BlockInfo block : blocksMap.getBlocks()) {
INodeFile fileINode = block.getINode();
@ -1803,6 +1804,12 @@ public class BlockManager {
addToInvalidates(block);
continue;
}
if (!block.isComplete()) {
// Incomplete blocks are never considered mis-replicated --
// they'll be reached when they are completed or recovered.
nrUnderConstruction++;
continue;
}
// calculate current replication
short expectedReplication = fileINode.getReplication();
NumberReplicas num = countNodes(block);
@ -1826,6 +1833,7 @@ public class BlockManager {
LOG.info("Number of invalid blocks = " + nrInvalid);
LOG.info("Number of under-replicated blocks = " + nrUnderReplicated);
LOG.info("Number of over-replicated blocks = " + nrOverReplicated);
LOG.info("Number of blocks being written = " + nrUnderConstruction);
}
/** Set replication for the blocks. */


@ -38,6 +38,8 @@ import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import com.google.common.annotations.VisibleForTesting;
/** The class is responsible for choosing the desired number of targets
* for placing block replicas.
* The replica placement strategy is that if the writer is on a datanode,
@ -49,6 +51,7 @@ import org.apache.hadoop.net.NodeBase;
@InterfaceAudience.Private
public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
private boolean considerLoad;
private boolean preferLocalNode = true;
private NetworkTopology clusterMap;
private FSClusterStats stats;
static final String enableDebugLogging = "For more information, please enable"
@ -223,7 +226,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
if (localMachine == null)
return chooseRandom(NodeBase.ROOT, excludedNodes,
blocksize, maxNodesPerRack, results);
if (preferLocalNode) {
// otherwise try local machine first
Node oldNode = excludedNodes.put(localMachine, localMachine);
if (oldNode == null) { // was not in the excluded list
@ -233,7 +236,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
return localMachine;
}
}
}
// try a node on local rack
return chooseLocalRack(localMachine, excludedNodes,
blocksize, maxNodesPerRack, results);
@ -568,5 +571,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
}
return cur;
}
@VisibleForTesting
void setPreferLocalNode(boolean prefer) {
this.preferLocalNode = prefer;
}
}


@ -172,6 +172,7 @@ import org.apache.hadoop.util.VersionInfo;
import org.mortbay.util.ajax.JSON;
import com.google.common.base.Preconditions;
import com.google.common.annotations.VisibleForTesting;
/***************************************************
* FSNamesystem does the actual bookkeeping work for the
@ -2842,7 +2843,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
/** Total number of blocks. */
int blockTotal;
/** Number of safe blocks. */
private int blockSafe;
int blockSafe;
/** Number of blocks needed to satisfy safe mode threshold condition */
private int blockThreshold;
/** Number of blocks needed before populating replication queues */
@ -2850,7 +2851,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
/** time of the last status printout */
private long lastStatusReport = 0;
/** flag indicating whether replication queues have been initialized */
private boolean initializedReplQueues = false;
boolean initializedReplQueues = false;
/** Was safemode entered automatically because available resources were low. */
private boolean resourcesLow = false;
@ -2980,9 +2981,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
*/
private synchronized void initializeReplQueues() {
LOG.info("initializing replication queues");
if (isPopulatingReplQueues()) {
LOG.warn("Replication queues already initialized.");
}
assert !isPopulatingReplQueues() : "Already initialized repl queues";
long startTimeMisReplicatedScan = now();
blockManager.processMisReplicatedBlocks();
initializedReplQueues = true;
@ -4412,4 +4411,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
byte[] password) throws InvalidToken {
getDelegationTokenSecretManager().verifyToken(identifier, password);
}
@VisibleForTesting
public SafeModeInfo getSafeModeInfoForTests() {
return safeMode;
}
}


@ -87,7 +87,6 @@ import com.google.common.collect.ImmutableList;
* primary NameNode.
*
**********************************************************/
@Deprecated // use BackupNode with -checkpoint argument instead.
@InterfaceAudience.Private
public class SecondaryNameNode implements Runnable {


@ -41,7 +41,7 @@
<!DOCTYPE html>
<html>
<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
<title>Hadoop <%=namenodeRole%>&nbsp;<%=namenodeLabel%></title>
<body>
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
<%=NamenodeJspHelper.getVersionTable(fsn)%>


@ -37,7 +37,7 @@
<html>
<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
<title>Hadoop <%=namenodeRole%>&nbsp;<%=namenodeLabel%></title>
<body>
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>


@ -37,7 +37,7 @@ String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameN
<html>
<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
<title>Hadoop <%=namenodeRole%>&nbsp;<%=namenodeLabel%></title>
<body>
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>


@ -193,6 +193,10 @@ public class BenchmarkThroughput extends Configured implements Tool {
BUFFER_SIZE = conf.getInt("dfsthroughput.buffer.size", 4 * 1024);
String localDir = conf.get("mapred.temp.dir");
if (localDir == null) {
localDir = conf.get("hadoop.tmp.dir");
conf.set("mapred.temp.dir", localDir);
}
dir = new LocalDirAllocator("mapred.temp.dir");
System.setProperty("test.build.data", localDir);


@ -19,22 +19,37 @@
package org.apache.hadoop.hdfs;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.test.GenericTestUtils;
import static org.junit.Assert.*;
import org.junit.Before;
import org.junit.After;
import org.junit.Test;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
/**
* Tests to verify safe mode correctness.
*/
public class TestSafeMode {
private static final Path TEST_PATH = new Path("/test");
private static final int BLOCK_SIZE = 1024;
Configuration conf;
MiniDFSCluster cluster;
FileSystem fs;
@ -43,6 +58,7 @@ public class TestSafeMode {
@Before
public void startUp() throws IOException {
conf = new HdfsConfiguration();
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
fs = cluster.getFileSystem();
@ -83,7 +99,7 @@ public class TestSafeMode {
// create two files with one block each.
DFSTestUtil.createFile(fs, file1, 1000, (short)1, 0);
DFSTestUtil.createFile(fs, file2, 2000, (short)1, 0);
DFSTestUtil.createFile(fs, file2, 1000, (short)1, 0);
fs.close();
cluster.shutdown();
@ -128,6 +144,106 @@ public class TestSafeMode {
assertEquals("", status);
}
/**
* Test that the NN initializes its under-replicated blocks queue
* before it is ready to exit safemode (HDFS-1476)
*/
@Test(timeout=45000)
public void testInitializeReplQueuesEarly() throws Exception {
// Spray the blocks around the cluster when we add DNs instead of
// concentrating all blocks on the first node.
BlockManagerTestUtil.setWritingPrefersLocalNode(
cluster.getNamesystem().getBlockManager(), false);
cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null);
cluster.waitActive();
DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L);
List<DataNodeProperties> dnprops = Lists.newLinkedList();
dnprops.add(cluster.stopDataNode(0));
dnprops.add(cluster.stopDataNode(0));
dnprops.add(cluster.stopDataNode(0));
cluster.getConfiguration(0).setFloat(
DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f);
cluster.restartNameNode();
final NameNode nn = cluster.getNameNode();
String status = nn.getNamesystem().getSafemode();
assertEquals("Safe mode is ON.The reported blocks 0 needs additional " +
"15 blocks to reach the threshold 0.9990 of total blocks 15. " +
"Safe mode will be turned off automatically.", status);
assertFalse("Mis-replicated block queues should not be initialized " +
"until threshold is crossed",
NameNodeAdapter.safeModeInitializedReplQueues(nn));
cluster.restartDataNode(dnprops.remove(0));
// Wait for the block report from the restarted DN to come in.
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0;
}
}, 10, 10000);
// SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks
// call is still going on at this point - wait until it's done by grabbing
// the lock.
nn.getNamesystem().writeLock();
nn.getNamesystem().writeUnlock();
int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn);
assertTrue("Expected first block report to make some but not all blocks " +
"safe. Got: " + safe, safe >= 1 && safe < 15);
BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn));
assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks());
cluster.restartDataNodes();
}
/**
* Test that, when under-replicated blocks are processed at the end of
* safe-mode, blocks currently under construction are not considered
* under-construction or missing. Regression test for HDFS-2822.
*/
@Test
public void testRbwBlocksNotConsideredUnderReplicated() throws IOException {
List<FSDataOutputStream> stms = Lists.newArrayList();
try {
// Create some junk blocks so that the NN doesn't just immediately
// exit safemode on restart.
DFSTestUtil.createFile(fs, new Path("/junk-blocks"),
BLOCK_SIZE*4, (short)1, 1L);
// Create several files which are left open. It's important to
// create several here, because otherwise the first iteration of the
// replication monitor will pull them off the replication queue and
// hide this bug from the test!
for (int i = 0; i < 10; i++) {
FSDataOutputStream stm = fs.create(
new Path("/append-" + i), true, BLOCK_SIZE, (short) 1, BLOCK_SIZE);
stms.add(stm);
stm.write(1);
stm.hflush();
}
cluster.restartNameNode();
FSNamesystem ns = cluster.getNameNode(0).getNamesystem();
BlockManagerTestUtil.updateState(ns.getBlockManager());
assertEquals(0, ns.getPendingReplicationBlocks());
assertEquals(0, ns.getCorruptReplicaBlocks());
assertEquals(0, ns.getMissingBlocksCount());
} finally {
for (FSDataOutputStream stm : stms) {
IOUtils.closeStream(stm);
}
cluster.shutdown();
}
}
public interface FSRun {
public abstract void run(FileSystem fs) throws IOException;
}


@ -27,6 +27,8 @@ import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.util.Daemon;
import com.google.common.base.Preconditions;
public class BlockManagerTestUtil {
public static void setNodeReplicationLimit(final BlockManager blockManager,
final int limit) {
@ -122,4 +124,17 @@ public class BlockManagerTestUtil {
return blockManager.computeDatanodeWork();
}
/**
* Change whether the block placement policy will prefer the writer's
* local Datanode or not.
* @param prefer
*/
public static void setWritingPrefersLocalNode(
BlockManager bm, boolean prefer) {
BlockPlacementPolicy bpp = bm.getBlockPlacementPolicy();
Preconditions.checkState(bpp instanceof BlockPlacementPolicyDefault,
"Must use default policy, got %s", bpp.getClass());
((BlockPlacementPolicyDefault)bpp).setPreferLocalNode(prefer);
}
}


@ -24,6 +24,7 @@ import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.SafeModeInfo;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.ipc.Server;
@ -97,4 +98,28 @@ public class NameNodeAdapter {
ns.readUnlock();
}
}
/**
* @return the number of blocks marked safe by safemode, or -1
* if safemode is not running.
*/
public static int getSafeModeSafeBlocks(NameNode nn) {
SafeModeInfo smi = nn.getNamesystem().getSafeModeInfoForTests();
if (smi == null) {
return -1;
}
return smi.blockSafe;
}
/**
* @return true if safemode is not running, or if safemode has already
* initialized the replication queues
*/
public static boolean safeModeInitializedReplQueues(NameNode nn) {
SafeModeInfo smi = nn.getNamesystem().getSafeModeInfoForTests();
if (smi == null) {
return true;
}
return smi.initializedReplQueues;
}
}


@ -203,7 +203,6 @@ public class TestCheckpoint extends TestCase {
/*
* Simulate namenode crashing after rolling edit log.
*/
@SuppressWarnings("deprecation")
public void testSecondaryNamenodeError1()
throws IOException {
LOG.info("Starting testSecondaryNamenodeError1");
@ -265,7 +264,6 @@ public class TestCheckpoint extends TestCase {
/*
* Simulate a namenode crash after uploading new image
*/
@SuppressWarnings("deprecation")
public void testSecondaryNamenodeError2() throws IOException {
LOG.info("Starting testSecondaryNamenodeError2");
Configuration conf = new HdfsConfiguration();
@ -324,7 +322,6 @@ public class TestCheckpoint extends TestCase {
/*
* Simulate a secondary namenode crash after rolling the edit log.
*/
@SuppressWarnings("deprecation")
public void testSecondaryNamenodeError3() throws IOException {
LOG.info("Starting testSecondaryNamenodeError3");
Configuration conf = new HdfsConfiguration();
@ -394,7 +391,6 @@ public class TestCheckpoint extends TestCase {
* back to the name-node.
* Used to truncate primary fsimage file.
*/
@SuppressWarnings("deprecation")
public void testSecondaryFailsToReturnImage() throws IOException {
LOG.info("Starting testSecondaryFailsToReturnImage");
Configuration conf = new HdfsConfiguration();
@ -471,7 +467,6 @@ public class TestCheckpoint extends TestCase {
* @param errorType the ErrorSimulator type to trigger
* @param exceptionSubstring an expected substring of the triggered exception
*/
@SuppressWarnings("deprecation")
private void doSendFailTest(int errorType, String exceptionSubstring)
throws IOException {
Configuration conf = new HdfsConfiguration();
@ -586,7 +581,6 @@ public class TestCheckpoint extends TestCase {
/**
* Test that the SecondaryNameNode properly locks its storage directories.
*/
@SuppressWarnings("deprecation")
public void testSecondaryNameNodeLocking() throws Exception {
// Start a primary NN so that the secondary will start successfully
Configuration conf = new HdfsConfiguration();
@ -679,7 +673,6 @@ public class TestCheckpoint extends TestCase {
* 2. if the NN does not contain an image, importing a checkpoint
* succeeds and re-saves the image
*/
@SuppressWarnings("deprecation")
public void testImportCheckpoint() throws Exception {
Configuration conf = new HdfsConfiguration();
Path testPath = new Path("/testfile");
@ -760,16 +753,12 @@ public class TestCheckpoint extends TestCase {
throw new IOException("Cannot create directory " + dir);
}
// This deprecation suppress warning does not work due to known Java bug:
// http://bugs.sun.com/view_bug.do?bug_id=6460147
@SuppressWarnings("deprecation")
SecondaryNameNode startSecondaryNameNode(Configuration conf
) throws IOException {
conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, "0.0.0.0:0");
return new SecondaryNameNode(conf);
}
@SuppressWarnings("deprecation")
SecondaryNameNode startSecondaryNameNode(Configuration conf, int index)
throws IOException {
Configuration snnConf = new Configuration(conf);
@ -782,7 +771,6 @@ public class TestCheckpoint extends TestCase {
/**
* Tests checkpoint in HDFS.
*/
@SuppressWarnings("deprecation")
public void testCheckpoint() throws IOException {
Path file1 = new Path("checkpoint.dat");
Path file2 = new Path("checkpoint2.dat");
@ -1009,7 +997,6 @@ public class TestCheckpoint extends TestCase {
* - it then fails again for the same reason
* - it then tries to checkpoint a third time
*/
@SuppressWarnings("deprecation")
public void testCheckpointAfterTwoFailedUploads() throws IOException {
MiniDFSCluster cluster = null;
SecondaryNameNode secondary = null;
@ -1064,7 +1051,6 @@ public class TestCheckpoint extends TestCase {
*
* @throws IOException
*/
@SuppressWarnings("deprecation")
public void testMultipleSecondaryNamenodes() throws IOException {
Configuration conf = new HdfsConfiguration();
String nameserviceId1 = "ns1";
@ -1114,7 +1100,6 @@ public class TestCheckpoint extends TestCase {
* Test that the secondary doesn't have to re-download image
* if it hasn't changed.
*/
@SuppressWarnings("deprecation")
public void testSecondaryImageDownload() throws IOException {
LOG.info("Starting testSecondaryImageDownload");
Configuration conf = new HdfsConfiguration();
@ -1197,7 +1182,6 @@ public class TestCheckpoint extends TestCase {
* It verifies that this works even though the earlier-txid checkpoint gets
* uploaded after the later-txid checkpoint.
*/
@SuppressWarnings("deprecation")
public void testMultipleSecondaryNNsAgainstSameNN() throws Exception {
Configuration conf = new HdfsConfiguration();
@ -1283,7 +1267,6 @@ public class TestCheckpoint extends TestCase {
* It verifies that one of the two gets an error that it's uploading a
* duplicate checkpoint, and the other one succeeds.
*/
@SuppressWarnings("deprecation")
public void testMultipleSecondaryNNsAgainstSameNN2() throws Exception {
Configuration conf = new HdfsConfiguration();
@ -1382,7 +1365,6 @@ public class TestCheckpoint extends TestCase {
* is running. The secondary should shut itself down if if talks to a NN
* with the wrong namespace.
*/
@SuppressWarnings("deprecation")
public void testReformatNNBetweenCheckpoints() throws IOException {
MiniDFSCluster cluster = null;
SecondaryNameNode secondary = null;
@ -1637,7 +1619,6 @@ public class TestCheckpoint extends TestCase {
/**
* Test that the 2NN triggers a checkpoint after the configurable interval
*/
@SuppressWarnings("deprecation")
public void testCheckpointTriggerOnTxnCount() throws Exception {
MiniDFSCluster cluster = null;
SecondaryNameNode secondary = null;
@ -1691,7 +1672,6 @@ public class TestCheckpoint extends TestCase {
* logs that connect the 2NN's old checkpoint to the current txid
* get archived. Then, the 2NN tries to checkpoint again.
*/
@SuppressWarnings("deprecation")
public void testSecondaryHasVeryOutOfDateImage() throws IOException {
MiniDFSCluster cluster = null;
SecondaryNameNode secondary = null;
@ -1729,7 +1709,6 @@ public class TestCheckpoint extends TestCase {
}
}
@SuppressWarnings("deprecation")
public void testCommandLineParsing() throws ParseException {
SecondaryNameNode.CommandLineOpts opts =
new SecondaryNameNode.CommandLineOpts();
@ -1764,7 +1743,6 @@ public class TestCheckpoint extends TestCase {
} catch (ParseException e) {}
}
@SuppressWarnings("deprecation")
private void cleanup(SecondaryNameNode snn) {
if (snn != null) {
try {
@ -1780,7 +1758,6 @@ public class TestCheckpoint extends TestCase {
* Assert that if any two files have the same name across the 2NNs
* and NN, they should have the same content too.
*/
@SuppressWarnings("deprecation")
private void assertParallelFilesInvariant(MiniDFSCluster cluster,
ImmutableList<SecondaryNameNode> secondaries) throws Exception {
List<File> allCurrentDirs = Lists.newArrayList();
@ -1792,7 +1769,6 @@ public class TestCheckpoint extends TestCase {
ImmutableSet.of("VERSION"));
}
@SuppressWarnings("deprecation")
private List<File> getCheckpointCurrentDirs(SecondaryNameNode secondary) {
List<File> ret = Lists.newArrayList();
for (URI u : secondary.getCheckpointDirs()) {
@ -1802,7 +1778,6 @@ public class TestCheckpoint extends TestCase {
return ret;
}
@SuppressWarnings("deprecation")
private CheckpointStorage spyOnSecondaryImage(SecondaryNameNode secondary1) {
CheckpointStorage spy = Mockito.spy((CheckpointStorage)secondary1.getFSImage());;
secondary1.setFSImage(spy);
@ -1812,7 +1787,6 @@ public class TestCheckpoint extends TestCase {
/**
* A utility class to perform a checkpoint in a different thread.
*/
@SuppressWarnings("deprecation")
private static class DoCheckpointThread extends Thread {
private final SecondaryNameNode snn;
private volatile Throwable thrown = null;


@ -106,9 +106,6 @@ public class TestNameEditsConfigs extends TestCase {
assertTrue(!fileSys.exists(name));
}
// This deprecation suppress warning does not work due to known Java bug:
// http://bugs.sun.com/view_bug.do?bug_id=6460147
@SuppressWarnings("deprecation")
SecondaryNameNode startSecondaryNameNode(Configuration conf
) throws IOException {
conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, "0.0.0.0:0");
@ -128,7 +125,6 @@ public class TestNameEditsConfigs extends TestCase {
* sure we are reading proper edits and image.
* @throws Exception
*/
@SuppressWarnings("deprecation")
public void testNameEditsConfigs() throws Exception {
Path file1 = new Path("TestNameEditsConfigs1");
Path file2 = new Path("TestNameEditsConfigs2");


@ -30,7 +30,6 @@ import org.junit.Test;
public class TestSecondaryWebUi {
@SuppressWarnings("deprecation")
@Test
public void testSecondaryWebUi() throws IOException {
Configuration conf = new Configuration();


@ -120,7 +120,6 @@ public class TestStartup extends TestCase {
* start MiniDFScluster, create a file (to create edits) and do a checkpoint
* @throws IOException
*/
@SuppressWarnings("deprecation")
public void createCheckPoint() throws IOException {
LOG.info("--starting mini cluster");
// manage dirs parameter set to false
@ -300,7 +299,6 @@ public class TestStartup extends TestCase {
* secondary node copies fsimage and edits into correct separate directories.
* @throws IOException
*/
@SuppressWarnings("deprecation")
public void testSNNStartup() throws IOException{
//setUpConfig();
LOG.info("--starting SecondNN startup test");


@ -153,7 +153,6 @@ public class TestStorageRestore {
* 7. run doCheckpoint
* 8. verify that all the image and edits files are the same.
*/
@SuppressWarnings("deprecation")
@Test
public void testStorageRestore() throws Exception {
int numDatanodes = 0;
@ -310,7 +309,6 @@ public class TestStorageRestore {
* then try to perform a checkpoint. The NN should not serve up the image or
* edits from the restored (empty) dir.
*/
@SuppressWarnings("deprecation")
@Test
public void testMultipleSecondaryCheckpoint() throws IOException {


@ -142,6 +142,14 @@ Release 0.23.1 - Unreleased
MAPREDUCE-3692. yarn-resourcemanager out and log files can get big. (eli)
MAPREDUCE-3710. Improved FileInputFormat to return better locality for the
last split. (Siddarth Seth via vinodkv)
MAPREDUCE-2765. DistCp Rewrite. (Mithun Radhakrishnan via mahadev)
MAPREDUCE-3737. The Web Application Proxy's is not documented very well.
(Robert Evans via mahadev)
OPTIMIZATIONS
MAPREDUCE-3567. Extraneous JobConf objects in AM heap. (Vinod Kumar
@ -165,7 +173,13 @@ Release 0.23.1 - Unreleased
MAPREDUCE-3512. Batching JobHistory flushing to DFS so that we don't flush
for every event slowing down AM. (Siddarth Seth via vinodkv)
MAPREDUCE-3718. Change default AM heartbeat interval to 1 second. (Hitesh
Shah via sseth)
BUG FIXES
MAPREDUCE-3194. "mapred mradmin" command is broken in mrv2
(Jason Lowe via bobby)
MAPREDUCE-3462. Fix Gridmix JUnit testcase failures.
(Ravi Prakash and Ravi Gummadi via amarrk)
@ -499,6 +513,48 @@ Release 0.23.1 - Unreleased
MAPREDUCE-3705. ant build fails on 0.23 branch. (Thomas Graves via
mahadev)
MAPREDUCE-3691. webservices add support to compress response.
(Thomas Graves via mahadev)
MAPREDUCE-3702. internal server error trying access application master
via proxy with filter enabled (Thomas Graves via mahadev)
MAPREDUCE-3646. Remove redundant URL info from "mapred job" output.
(Jonathan Eagles via mahadev)
MAPREDUCE-3681. Fixed computation of queue's usedCapacity. (acmurthy)
MAPREDUCE-3505. yarn APPLICATION_CLASSPATH needs to be overridable.
(ahmed via tucu)
MAPREDUCE-3714. Fixed EventFetcher and Fetcher threads to shut-down properly
so that reducers don't hang in corner cases. (vinodkv)
MAPREDUCE-3712. The mapreduce tar does not contain the hadoop-mapreduce-client-
jobclient-tests.jar. (mahadev)
MAPREDUCE-3717. JobClient test jar has missing files to run all the test programs.
(mahadev)
MAPREDUCE-3630. Fixes a NullPointer exception while running TeraGen - if a
map is asked to generate 0 records. (Mahadev Konar via sseth)
MAPREDUCE-3683. Fixed maxCapacity of queues to be product of parent
maxCapacities. (acmurthy)
MAPREDUCE-3713. Fixed the way head-room is allocated to applications by
CapacityScheduler so that it deducts current-usage per user and not
per-application. (Arun C Murthy via vinodkv)
MAPREDUCE-3721. Fixed a race in shuffle which caused reduces to hang.
(sseth via acmurthy)
MAPREDUCE-3733. Add Apache License Header to hadoop-distcp/pom.xml.
(mahadev)
MAPREDUCE-3735. Add distcp jar to the distribution (tar).
(mahadev)
Release 0.23.0 - 2011-11-01
INCOMPATIBLE CHANGES


@ -30,9 +30,6 @@ fi
function print_usage(){
echo "Usage: mapred [--config confdir] COMMAND"
echo " where COMMAND is one of:"
echo " mradmin run a Map-Reduce admin client"
echo " jobtracker run the MapReduce job Tracker node"
echo " tasktracker run a MapReduce task Tracker node"
echo " pipes run a Pipes job"
echo " job manipulate MapReduce jobs"
echo " queue get information regarding JobQueues"
@ -51,16 +48,7 @@ fi
COMMAND=$1
shift
if [ "$COMMAND" = "mradmin" ] ; then
CLASS=org.apache.hadoop.mapred.tools.MRAdmin
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "jobtracker" ] ; then
CLASS=org.apache.hadoop.mapred.JobTracker
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
elif [ "$COMMAND" = "tasktracker" ] ; then
CLASS=org.apache.hadoop.mapred.TaskTracker
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
elif [ "$COMMAND" = "job" ] ; then
if [ "$COMMAND" = "job" ] ; then
CLASS=org.apache.hadoop.mapred.JobClient
elif [ "$COMMAND" = "queue" ] ; then
CLASS=org.apache.hadoop.mapred.JobQueueClient
@ -75,6 +63,13 @@ elif [ "$COMMAND" = "classpath" ] ; then
elif [ "$COMMAND" = "groups" ] ; then
CLASS=org.apache.hadoop.mapred.tools.GetGroups
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "mradmin" ] \
|| [ "$COMMAND" = "jobtracker" ] \
|| [ "$COMMAND" = "tasktracker" ] ; then
echo "Sorry, the $COMMAND command is no longer supported."
echo "You may find similar functionality with the \"yarn\" shell command."
print_usage
exit
else
echo $COMMAND - invalid command
print_usage


@ -522,13 +522,13 @@ public abstract class TaskAttemptImpl implements
* a parent CLC and use it for all the containers, so this should go away
* once the mr-generated-classpath stuff is gone.
*/
private static String getInitialClasspath() throws IOException {
private static String getInitialClasspath(Configuration conf) throws IOException {
synchronized (classpathLock) {
if (initialClasspathFlag.get()) {
return initialClasspath;
}
Map<String, String> env = new HashMap<String, String>();
MRApps.setClasspath(env);
MRApps.setClasspath(env, conf);
initialClasspath = env.get(Environment.CLASSPATH.name());
initialClasspathFlag.set(true);
return initialClasspath;
@ -631,7 +631,7 @@ public abstract class TaskAttemptImpl implements
Apps.addToEnvironment(
environment,
Environment.CLASSPATH.name(),
getInitialClasspath());
getInitialClasspath(conf));
} catch (IOException e) {
throw new YarnException(e);
}


@ -38,6 +38,10 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-server-common</artifactId>
</dependency>
</dependencies>
<build>


@ -54,6 +54,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.util.Apps;
import org.apache.hadoop.yarn.util.BuilderUtils;
@ -171,7 +172,7 @@ public class MRApps extends Apps {
}
private static void setMRFrameworkClasspath(
Map<String, String> environment) throws IOException {
Map<String, String> environment, Configuration conf) throws IOException {
InputStream classpathFileStream = null;
BufferedReader reader = null;
try {
@ -208,8 +209,10 @@ public class MRApps extends Apps {
}
// Add standard Hadoop classes
for (String c : ApplicationConstants.APPLICATION_CLASSPATH) {
Apps.addToEnvironment(environment, Environment.CLASSPATH.name(), c);
for (String c : conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH)
.split(",")) {
Apps.addToEnvironment(environment, Environment.CLASSPATH.name(), c
.trim());
}
} finally {
if (classpathFileStream != null) {
@ -222,8 +225,8 @@ public class MRApps extends Apps {
// TODO: Remove duplicates.
}
public static void setClasspath(Map<String, String> environment)
throws IOException {
public static void setClasspath(Map<String, String> environment,
Configuration conf) throws IOException {
Apps.addToEnvironment(
environment,
Environment.CLASSPATH.name(),
@ -232,7 +235,7 @@ public class MRApps extends Apps {
environment,
Environment.CLASSPATH.name(),
Environment.PWD.$() + Path.SEPARATOR + "*");
MRApps.setMRFrameworkClasspath(environment);
MRApps.setMRFrameworkClasspath(environment, conf);
}
private static final String STAGING_CONSTANT = ".staging";


@ -18,7 +18,12 @@
package org.apache.hadoop.mapreduce.v2.util;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@ -121,4 +126,17 @@ public class TestMRApps {
"/my/path/to/staging/dummy-user/.staging/job_dummy-job_12345/job.xml", jobFile);
}
@Test public void testSetClasspath() throws IOException {
Job job = Job.getInstance();
Map<String, String> environment = new HashMap<String, String>();
MRApps.setClasspath(environment, job.getConfiguration());
assertEquals("job.jar:$PWD/*:$HADOOP_CONF_DIR:" +
"$HADOOP_COMMON_HOME/share/hadoop/common/*:" +
"$HADOOP_COMMON_HOME/share/hadoop/common/lib/*:" +
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/*:" +
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*:" +
"$YARN_HOME/share/hadoop/mapreduce/*:" +
"$YARN_HOME/share/hadoop/mapreduce/lib/*",
environment.get("CLASSPATH"));
}
}


@ -289,8 +289,10 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
}
if (bytesRemaining != 0) {
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkLocations.length-1].getHosts()));
String[] splitHosts = getSplitHosts(blkLocations, length
- bytesRemaining, bytesRemaining, clusterMap);
splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
splitHosts));
}
} else if (length != 0) {
String[] splitHosts = getSplitHosts(blkLocations,0,length,clusterMap);


@ -1216,6 +1216,7 @@ public class Job extends JobContextImpl implements JobContext {
}
});
state = JobState.RUNNING;
LOG.info("The url to track the job: " + getTrackingURL());
}
/**


@ -417,7 +417,7 @@ public interface MRJobConfig {
/** How often the AM should send heartbeats to the RM.*/
public static final String MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS =
MR_AM_PREFIX + "scheduler.heartbeat.interval-ms";
public static final int DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = 2000;
public static final int DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = 1000;
/**
* If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS


@ -286,8 +286,9 @@ public abstract class FileInputFormat<K, V> extends InputFormat<K, V> {
}
if (bytesRemaining != 0) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkLocations.length-1].getHosts()));
blkLocations[blkIndex].getHosts()));
}
} else { // not splitable
splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));


@ -27,6 +27,7 @@ import org.apache.hadoop.mapred.TaskCompletionEvent;
import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
import org.apache.hadoop.mapreduce.TaskAttemptID;
@SuppressWarnings("deprecation")
class EventFetcher<K,V> extends Thread {
private static final long SLEEP_TIME = 1000;
private static final int MAX_EVENTS_TO_FETCH = 10000;
@ -42,6 +43,8 @@ class EventFetcher<K,V> extends Thread {
private int maxMapRuntime = 0;
private volatile boolean stopped = false;
public EventFetcher(TaskAttemptID reduce,
TaskUmbilicalProtocol umbilical,
ShuffleScheduler<K,V> scheduler,
@ -60,7 +63,7 @@ class EventFetcher<K,V> extends Thread {
LOG.info(reduce + " Thread started: " + getName());
try {
while (true && !Thread.currentThread().isInterrupted()) {
while (!stopped && !Thread.currentThread().isInterrupted()) {
try {
int numNewMaps = getMapCompletionEvents();
failures = 0;
@ -71,6 +74,9 @@ class EventFetcher<K,V> extends Thread {
if (!Thread.currentThread().isInterrupted()) {
Thread.sleep(SLEEP_TIME);
}
} catch (InterruptedException e) {
LOG.info("EventFetcher is interrupted.. Returning");
return;
} catch (IOException ie) {
LOG.info("Exception in getting events", ie);
// check to see whether to abort
@ -91,6 +97,16 @@ class EventFetcher<K,V> extends Thread {
}
}
public void shutDown() {
this.stopped = true;
interrupt();
try {
join(5000);
} catch(InterruptedException ie) {
LOG.warn("Got interrupted while joining " + getName(), ie);
}
}
/**
* Queries the {@link TaskTracker} for a set of map-completion events
* from a given event ID.


@ -48,6 +48,7 @@ import org.apache.hadoop.mapreduce.task.reduce.MapOutput.Type;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;
@SuppressWarnings({"deprecation"})
class Fetcher<K,V> extends Thread {
private static final Log LOG = LogFactory.getLog(Fetcher.class);
@ -88,6 +89,8 @@ class Fetcher<K,V> extends Thread {
private final Decompressor decompressor;
private final SecretKey jobTokenSecret;
private volatile boolean stopped = false;
public Fetcher(JobConf job, TaskAttemptID reduceId,
ShuffleScheduler<K,V> scheduler, MergeManager<K,V> merger,
Reporter reporter, ShuffleClientMetrics metrics,
@ -135,7 +138,7 @@ class Fetcher<K,V> extends Thread {
public void run() {
try {
while (true && !Thread.currentThread().isInterrupted()) {
while (!stopped && !Thread.currentThread().isInterrupted()) {
MapHost host = null;
try {
// If merge is on, block
@ -161,6 +164,16 @@ class Fetcher<K,V> extends Thread {
}
}
public void shutDown() throws InterruptedException {
this.stopped = true;
interrupt();
try {
join(5000);
} catch (InterruptedException ie) {
LOG.warn("Got interrupt while joining " + getName(), ie);
}
}
/**
* The crux of the matter...
*


@ -92,6 +92,7 @@ public class MergeManager<K, V> {
private final long memoryLimit;
private long usedMemory;
private long commitMemory;
private final long maxSingleShuffleLimit;
private final int memToMemMergeOutputsThreshold;
@ -181,6 +182,13 @@ public class MergeManager<K, V> {
"ioSortFactor=" + ioSortFactor + ", " +
"memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);
if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
throw new RuntimeException("Invlaid configuration: "
+ "maxSingleShuffleLimit should be less than mergeThreshold"
+ "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit
+ "mergeThreshold: " + this.mergeThreshold);
}
boolean allowMemToMemMerge =
jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
if (allowMemToMemMerge) {
@ -245,16 +253,16 @@ public class MergeManager<K, V> {
// all the stalled threads
if (usedMemory > memoryLimit) {
LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory +
") is greater than memoryLimit (" + memoryLimit + ")");
LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory
+ ") is greater than memoryLimit (" + memoryLimit + ")." +
" CommitMemory is (" + commitMemory + ")");
return stallShuffle;
}
// Allow the in-memory shuffle to progress
LOG.debug(mapId + ": Proceeding with shuffle since usedMemory (" +
usedMemory +
") is lesser than memoryLimit (" + memoryLimit + ")");
LOG.debug(mapId + ": Proceeding with shuffle since usedMemory ("
+ usedMemory + ") is lesser than memoryLimit (" + memoryLimit + ")."
+ "CommitMemory is (" + commitMemory + ")");
return unconditionalReserve(mapId, requestedSize, true);
}
@ -270,18 +278,24 @@ public class MergeManager<K, V> {
}
synchronized void unreserve(long size) {
commitMemory -= size;
usedMemory -= size;
}
public synchronized void closeInMemoryFile(MapOutput<K,V> mapOutput) {
inMemoryMapOutputs.add(mapOutput);
LOG.info("closeInMemoryFile -> map-output of size: " + mapOutput.getSize()
+ ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size());
+ ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size()
+ ", commitMemory -> " + commitMemory + ", usedMemory ->" + usedMemory);
commitMemory+= mapOutput.getSize();
synchronized (inMemoryMerger) {
if (!inMemoryMerger.isInProgress() && usedMemory >= mergeThreshold) {
LOG.info("Starting inMemoryMerger's merge since usedMemory=" +
usedMemory + " > mergeThreshold=" + mergeThreshold);
// Can hang if mergeThreshold is really low.
if (!inMemoryMerger.isInProgress() && commitMemory >= mergeThreshold) {
LOG.info("Starting inMemoryMerger's merge since commitMemory=" +
commitMemory + " > mergeThreshold=" + mergeThreshold +
". Current usedMemory=" + usedMemory);
inMemoryMapOutputs.addAll(inMemoryMergedMapOutputs);
inMemoryMergedMapOutputs.clear();
inMemoryMerger.startMerge(inMemoryMapOutputs);


@ -19,8 +19,6 @@ package org.apache.hadoop.mapreduce.task.reduce;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileSystem;
@ -33,17 +31,17 @@ import org.apache.hadoop.mapred.RawKeyValueIterator;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Task;
import org.apache.hadoop.mapred.Task.CombineOutputCollector;
import org.apache.hadoop.mapred.TaskStatus;
import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
import org.apache.hadoop.mapred.Task.CombineOutputCollector;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.util.Progress;
@InterfaceAudience.Private
@InterfaceStability.Unstable
@SuppressWarnings({"deprecation", "unchecked", "rawtypes"})
public class Shuffle<K, V> implements ExceptionReporter {
private static final Log LOG = LogFactory.getLog(Shuffle.class);
private static final int PROGRESS_FREQUENCY = 2000;
private final TaskAttemptID reduceId;
@ -100,7 +98,6 @@ public class Shuffle<K, V> implements ExceptionReporter {
this, mergePhase, mapOutputFile);
}
@SuppressWarnings("unchecked")
public RawKeyValueIterator run() throws IOException, InterruptedException {
// Start the map-completion events fetcher thread
final EventFetcher<K,V> eventFetcher =
@ -130,19 +127,11 @@ public class Shuffle<K, V> implements ExceptionReporter {
}
// Stop the event-fetcher thread
eventFetcher.interrupt();
try {
eventFetcher.join();
} catch(Throwable t) {
LOG.info("Failed to stop " + eventFetcher.getName(), t);
}
eventFetcher.shutDown();
// Stop the map-output fetcher threads
for (Fetcher<K,V> fetcher : fetchers) {
fetcher.interrupt();
}
for (Fetcher<K,V> fetcher : fetchers) {
fetcher.join();
fetcher.shutDown();
}
fetchers = null;
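The interrupt/join boilerplate above is replaced by a shutDown() call on each fetcher. The Fetcher and EventFetcher bodies are not part of this diff, so the following is only a hedged sketch of what such a helper on a fetcher thread would typically wrap; the timeout, LOG field, and log message are assumptions.
// Hypothetical shape of a fetcher thread's shutDown() helper (assumed, not from this diff).
public void shutDown() {
  this.interrupt();              // ask the thread to stop fetching
  try {
    this.join(5000);             // bounded wait for it to exit
  } catch (InterruptedException ie) {
    LOG.info("Interrupted while stopping " + getName(), ie);
    Thread.currentThread().interrupt();
  }
}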

View File

@ -102,6 +102,13 @@
<phase>test-compile</phase>
</execution>
</executions>
<configuration>
<archive>
<manifest>
<mainClass>org.apache.hadoop.test.MapredTestDriver</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>

View File

@ -175,7 +175,6 @@ public class ClientServiceDelegate {
+ ":" + addr.getPort()));
newUgi.addToken(clientToken);
}
LOG.info("The url to track the job: " + application.getTrackingUrl());
LOG.debug("Connecting to " + serviceAddr);
final String tempStr = serviceAddr;
realProxy = newUgi.doAs(new PrivilegedExceptionAction<MRClientProtocol>() {

View File

@ -406,7 +406,7 @@ public class YARNRunner implements ClientProtocol {
// Setup the CLASSPATH in environment
// i.e. add { job jar, CWD, Hadoop jars} to classpath.
Map<String, String> environment = new HashMap<String, String>();
MRApps.setClasspath(environment);
MRApps.setClasspath(environment, conf);
// Parse distributed cache
MRApps.setupDistributedCache(jobConf, localResources);

View File

@ -29,7 +29,6 @@ import java.util.Stack;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.examples.RandomTextWriter;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@ -40,6 +39,7 @@ import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.mapreduce.RandomTextWriter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;

View File

@ -17,6 +17,10 @@
*/
package org.apache.hadoop.mapred;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import java.io.DataOutputStream;
import java.io.IOException;
@ -32,6 +36,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.Text;
@SuppressWarnings("deprecation")
public class TestFileInputFormat extends TestCase {
Configuration conf = new Configuration();
@ -186,6 +191,102 @@ public class TestFileInputFormat extends TestCase {
assertEquals(splits.length, 2);
}
@SuppressWarnings("rawtypes")
public void testLastInputSplitAtSplitBoundary() throws Exception {
FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024,
128l * 1024 * 1024);
JobConf job = new JobConf();
InputSplit[] splits = fif.getSplits(job, 8);
assertEquals(8, splits.length);
for (int i = 0; i < splits.length; i++) {
InputSplit split = splits[i];
assertEquals(("host" + i), split.getLocations()[0]);
}
}
@SuppressWarnings("rawtypes")
public void testLastInputSplitExceedingSplitBoundary() throws Exception {
FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024,
128l * 1024 * 1024);
JobConf job = new JobConf();
InputSplit[] splits = fif.getSplits(job, 8);
assertEquals(8, splits.length);
for (int i = 0; i < splits.length; i++) {
InputSplit split = splits[i];
assertEquals(("host" + i), split.getLocations()[0]);
}
}
@SuppressWarnings("rawtypes")
public void testLastInputSplitSingleSplit() throws Exception {
FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024,
128l * 1024 * 1024);
JobConf job = new JobConf();
InputSplit[] splits = fif.getSplits(job, 1);
assertEquals(1, splits.length);
for (int i = 0; i < splits.length; i++) {
InputSplit split = splits[i];
assertEquals(("host" + i), split.getLocations()[0]);
}
}
private class FileInputFormatForTest<K, V> extends FileInputFormat<K, V> {
long splitSize;
long length;
FileInputFormatForTest(long length, long splitSize) {
this.length = length;
this.splitSize = splitSize;
}
@Override
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job,
Reporter reporter) throws IOException {
return null;
}
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
FileStatus mockFileStatus = mock(FileStatus.class);
when(mockFileStatus.getBlockSize()).thenReturn(splitSize);
when(mockFileStatus.isDirectory()).thenReturn(false);
Path mockPath = mock(Path.class);
FileSystem mockFs = mock(FileSystem.class);
BlockLocation[] blockLocations = mockBlockLocations(length, splitSize);
when(mockFs.getFileBlockLocations(mockFileStatus, 0, length)).thenReturn(
blockLocations);
when(mockPath.getFileSystem(any(Configuration.class))).thenReturn(mockFs);
when(mockFileStatus.getPath()).thenReturn(mockPath);
when(mockFileStatus.getLen()).thenReturn(length);
FileStatus[] fs = new FileStatus[1];
fs[0] = mockFileStatus;
return fs;
}
@Override
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
return splitSize;
}
private BlockLocation[] mockBlockLocations(long size, long splitSize) {
int numLocations = (int) (size / splitSize);
if (size % splitSize != 0)
numLocations++;
BlockLocation[] blockLocations = new BlockLocation[numLocations];
for (int i = 0; i < numLocations; i++) {
String[] names = new String[] { "b" + i };
String[] hosts = new String[] { "host" + i };
blockLocations[i] = new BlockLocation(names, hosts, i * splitSize,
Math.min(splitSize, size - (splitSize * i)));
}
return blockLocations;
}
}
static void writeFile(Configuration conf, Path name,
short replication, int numBlocks) throws IOException {
FileSystem fileSys = FileSystem.get(conf);
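Worked numbers for the 1027 MB case above, to make the expected split count and host assignment concrete. The 1.1 slop factor is FileInputFormat's usual tolerance for the tail split; treat the exact constant as an assumption here.
// Illustrative arithmetic for testLastInputSplitExceedingSplitBoundary.
long length    = 1027L * 1024 * 1024;   // file size
long splitSize =  128L * 1024 * 1024;   // forced by the computeSplitSize() override
// 7 full splits cover 896 MB; the remaining 131 MB is within 1.1 * splitSize,
// so it becomes a single 8th split rather than splits 8 and 9.
long tail = length - 7 * splitSize;                       // 131 MB
boolean mergedTail = (double) tail / splitSize <= 1.1;    // 1.023 <= 1.1 -> true
// mockBlockLocations() still creates 9 blocks (the 9th is 3 MB on "host8"),
// but the 8th split's location comes from the block holding its start offset: "host7".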

View File

@ -25,7 +25,6 @@ import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.examples.RandomWriter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;

View File

@ -29,7 +29,6 @@ import java.util.Stack;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.examples.RandomTextWriter;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

View File

@ -0,0 +1,757 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* This program uses map/reduce to just run a distributed job where there is
* no interaction between the tasks and each task writes a large unsorted
* random sequence of words.
* In order for this program to generate data for terasort with 5-10 words
* per key and 20-100 words per value, have the following config:
* <xmp>
* <?xml version="1.0"?>
* <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
* <configuration>
* <property>
* <name>mapreduce.randomtextwriter.minwordskey</name>
* <value>5</value>
* </property>
* <property>
* <name>mapreduce.randomtextwriter.maxwordskey</name>
* <value>10</value>
* </property>
* <property>
* <name>mapreduce.randomtextwriter.minwordsvalue</name>
* <value>20</value>
* </property>
* <property>
* <name>mapreduce.randomtextwriter.maxwordsvalue</name>
* <value>100</value>
* </property>
* <property>
* <name>mapreduce.randomtextwriter.totalbytes</name>
* <value>1099511627776</value>
* </property>
* </configuration></xmp>
*
* Equivalently, {@link RandomTextWriter} also supports all the above options
* and ones supported by {@link Tool} via the command-line.
*
* To run: bin/hadoop jar hadoop-${version}-examples.jar randomtextwriter
* [-outFormat <i>output format class</i>] <i>output</i>
*/
public class RandomTextWriter extends Configured implements Tool {
public static final String TOTAL_BYTES =
"mapreduce.randomtextwriter.totalbytes";
public static final String BYTES_PER_MAP =
"mapreduce.randomtextwriter.bytespermap";
public static final String MAPS_PER_HOST =
"mapreduce.randomtextwriter.mapsperhost";
public static final String MAX_VALUE = "mapreduce.randomtextwriter.maxwordsvalue";
public static final String MIN_VALUE = "mapreduce.randomtextwriter.minwordsvalue";
public static final String MIN_KEY = "mapreduce.randomtextwriter.minwordskey";
public static final String MAX_KEY = "mapreduce.randomtextwriter.maxwordskey";
static int printUsage() {
System.out.println("randomtextwriter " +
"[-outFormat <output format class>] " +
"<output>");
ToolRunner.printGenericCommandUsage(System.out);
return 2;
}
/**
* User counters
*/
static enum Counters { RECORDS_WRITTEN, BYTES_WRITTEN }
static class RandomTextMapper extends Mapper<Text, Text, Text, Text> {
private long numBytesToWrite;
private int minWordsInKey;
private int wordsInKeyRange;
private int minWordsInValue;
private int wordsInValueRange;
private Random random = new Random();
/**
* Save the configuration values that we need to write the data.
*/
public void setup(Context context) {
Configuration conf = context.getConfiguration();
numBytesToWrite = conf.getLong(BYTES_PER_MAP,
1*1024*1024*1024);
minWordsInKey = conf.getInt(MIN_KEY, 5);
wordsInKeyRange = (conf.getInt(MAX_KEY, 10) - minWordsInKey);
minWordsInValue = conf.getInt(MIN_VALUE, 10);
wordsInValueRange = (conf.getInt(MAX_VALUE, 100) - minWordsInValue);
}
/**
* Given an output filename, write a bunch of random records to it.
*/
public void map(Text key, Text value,
Context context) throws IOException,InterruptedException {
int itemCount = 0;
while (numBytesToWrite > 0) {
// Generate the key/value
int noWordsKey = minWordsInKey +
(wordsInKeyRange != 0 ? random.nextInt(wordsInKeyRange) : 0);
int noWordsValue = minWordsInValue +
(wordsInValueRange != 0 ? random.nextInt(wordsInValueRange) : 0);
Text keyWords = generateSentence(noWordsKey);
Text valueWords = generateSentence(noWordsValue);
// Write the sentence
context.write(keyWords, valueWords);
numBytesToWrite -= (keyWords.getLength() + valueWords.getLength());
// Update counters, progress etc.
context.getCounter(Counters.BYTES_WRITTEN).increment(
keyWords.getLength() + valueWords.getLength());
context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
if (++itemCount % 200 == 0) {
context.setStatus("wrote record " + itemCount + ". " +
numBytesToWrite + " bytes left.");
}
}
context.setStatus("done with " + itemCount + " records.");
}
private Text generateSentence(int noWords) {
StringBuffer sentence = new StringBuffer();
String space = " ";
for (int i=0; i < noWords; ++i) {
sentence.append(words[random.nextInt(words.length)]);
sentence.append(space);
}
return new Text(sentence.toString());
}
}
/**
* This is the main routine for launching a distributed random write job.
* It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
* The reduce doesn't do anything.
*
* @throws IOException
*/
public int run(String[] args) throws Exception {
if (args.length == 0) {
return printUsage();
}
Configuration conf = getConf();
JobClient client = new JobClient(conf);
ClusterStatus cluster = client.getClusterStatus();
int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
1*1024*1024*1024);
if (numBytesToWritePerMap == 0) {
System.err.println("Cannot have " + BYTES_PER_MAP +" set to 0");
return -2;
}
long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
if (numMaps == 0 && totalBytesToWrite > 0) {
numMaps = 1;
conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
}
conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
Job job = new Job(conf);
job.setJarByClass(RandomTextWriter.class);
job.setJobName("random-text-writer");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
job.setMapperClass(RandomTextMapper.class);
Class<? extends OutputFormat> outputFormatClass =
SequenceFileOutputFormat.class;
List<String> otherArgs = new ArrayList<String>();
for(int i=0; i < args.length; ++i) {
try {
if ("-outFormat".equals(args[i])) {
outputFormatClass =
Class.forName(args[++i]).asSubclass(OutputFormat.class);
} else {
otherArgs.add(args[i]);
}
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from " +
args[i-1]);
return printUsage(); // exits
}
}
job.setOutputFormatClass(outputFormatClass);
FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
System.out.println("Running " + numMaps + " maps.");
// reducer NONE
job.setNumReduceTasks(0);
Date startTime = new Date();
System.out.println("Job started: " + startTime);
int ret = job.waitForCompletion(true) ? 0 : 1;
Date endTime = new Date();
System.out.println("Job ended: " + endTime);
System.out.println("The job took " +
(endTime.getTime() - startTime.getTime()) /1000 +
" seconds.");
return ret;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new RandomTextWriter(), args);
System.exit(res);
}
/**
* A random list of 100 words from /usr/share/dict/words
*/
private static String[] words = {
"diurnalness", "Homoiousian",
"spiranthic", "tetragynian",
"silverhead", "ungreat",
"lithograph", "exploiter",
"physiologian", "by",
"hellbender", "Filipendula",
"undeterring", "antiscolic",
"pentagamist", "hypoid",
"cacuminal", "sertularian",
"schoolmasterism", "nonuple",
"gallybeggar", "phytonic",
"swearingly", "nebular",
"Confervales", "thermochemically",
"characinoid", "cocksuredom",
"fallacious", "feasibleness",
"debromination", "playfellowship",
"tramplike", "testa",
"participatingly", "unaccessible",
"bromate", "experientialist",
"roughcast", "docimastical",
"choralcelo", "blightbird",
"peptonate", "sombreroed",
"unschematized", "antiabolitionist",
"besagne", "mastication",
"bromic", "sviatonosite",
"cattimandoo", "metaphrastical",
"endotheliomyoma", "hysterolysis",
"unfulminated", "Hester",
"oblongly", "blurredness",
"authorling", "chasmy",
"Scorpaenidae", "toxihaemia",
"Dictograph", "Quakerishly",
"deaf", "timbermonger",
"strammel", "Thraupidae",
"seditious", "plerome",
"Arneb", "eristically",
"serpentinic", "glaumrie",
"socioromantic", "apocalypst",
"tartrous", "Bassaris",
"angiolymphoma", "horsefly",
"kenno", "astronomize",
"euphemious", "arsenide",
"untongued", "parabolicness",
"uvanite", "helpless",
"gemmeous", "stormy",
"templar", "erythrodextrin",
"comism", "interfraternal",
"preparative", "parastas",
"frontoorbital", "Ophiosaurus",
"diopside", "serosanguineous",
"ununiformly", "karyological",
"collegian", "allotropic",
"depravity", "amylogenesis",
"reformatory", "epidymides",
"pleurotropous", "trillium",
"dastardliness", "coadvice",
"embryotic", "benthonic",
"pomiferous", "figureheadship",
"Megaluridae", "Harpa",
"frenal", "commotion",
"abthainry", "cobeliever",
"manilla", "spiciferous",
"nativeness", "obispo",
"monilioid", "biopsic",
"valvula", "enterostomy",
"planosubulate", "pterostigma",
"lifter", "triradiated",
"venialness", "tum",
"archistome", "tautness",
"unswanlike", "antivenin",
"Lentibulariaceae", "Triphora",
"angiopathy", "anta",
"Dawsonia", "becomma",
"Yannigan", "winterproof",
"antalgol", "harr",
"underogating", "ineunt",
"cornberry", "flippantness",
"scyphostoma", "approbation",
"Ghent", "Macraucheniidae",
"scabbiness", "unanatomized",
"photoelasticity", "eurythermal",
"enation", "prepavement",
"flushgate", "subsequentially",
"Edo", "antihero",
"Isokontae", "unforkedness",
"porriginous", "daytime",
"nonexecutive", "trisilicic",
"morphiomania", "paranephros",
"botchedly", "impugnation",
"Dodecatheon", "obolus",
"unburnt", "provedore",
"Aktistetae", "superindifference",
"Alethea", "Joachimite",
"cyanophilous", "chorograph",
"brooky", "figured",
"periclitation", "quintette",
"hondo", "ornithodelphous",
"unefficient", "pondside",
"bogydom", "laurinoxylon",
"Shiah", "unharmed",
"cartful", "noncrystallized",
"abusiveness", "cromlech",
"japanned", "rizzomed",
"underskin", "adscendent",
"allectory", "gelatinousness",
"volcano", "uncompromisingly",
"cubit", "idiotize",
"unfurbelowed", "undinted",
"magnetooptics", "Savitar",
"diwata", "ramosopalmate",
"Pishquow", "tomorn",
"apopenptic", "Haversian",
"Hysterocarpus", "ten",
"outhue", "Bertat",
"mechanist", "asparaginic",
"velaric", "tonsure",
"bubble", "Pyrales",
"regardful", "glyphography",
"calabazilla", "shellworker",
"stradametrical", "havoc",
"theologicopolitical", "sawdust",
"diatomaceous", "jajman",
"temporomastoid", "Serrifera",
"Ochnaceae", "aspersor",
"trailmaking", "Bishareen",
"digitule", "octogynous",
"epididymitis", "smokefarthings",
"bacillite", "overcrown",
"mangonism", "sirrah",
"undecorated", "psychofugal",
"bismuthiferous", "rechar",
"Lemuridae", "frameable",
"thiodiazole", "Scanic",
"sportswomanship", "interruptedness",
"admissory", "osteopaedion",
"tingly", "tomorrowness",
"ethnocracy", "trabecular",
"vitally", "fossilism",
"adz", "metopon",
"prefatorial", "expiscate",
"diathermacy", "chronist",
"nigh", "generalizable",
"hysterogen", "aurothiosulphuric",
"whitlowwort", "downthrust",
"Protestantize", "monander",
"Itea", "chronographic",
"silicize", "Dunlop",
"eer", "componental",
"spot", "pamphlet",
"antineuritic", "paradisean",
"interruptor", "debellator",
"overcultured", "Florissant",
"hyocholic", "pneumatotherapy",
"tailoress", "rave",
"unpeople", "Sebastian",
"thermanesthesia", "Coniferae",
"swacking", "posterishness",
"ethmopalatal", "whittle",
"analgize", "scabbardless",
"naught", "symbiogenetically",
"trip", "parodist",
"columniform", "trunnel",
"yawler", "goodwill",
"pseudohalogen", "swangy",
"cervisial", "mediateness",
"genii", "imprescribable",
"pony", "consumptional",
"carposporangial", "poleax",
"bestill", "subfebrile",
"sapphiric", "arrowworm",
"qualminess", "ultraobscure",
"thorite", "Fouquieria",
"Bermudian", "prescriber",
"elemicin", "warlike",
"semiangle", "rotular",
"misthread", "returnability",
"seraphism", "precostal",
"quarried", "Babylonism",
"sangaree", "seelful",
"placatory", "pachydermous",
"bozal", "galbulus",
"spermaphyte", "cumbrousness",
"pope", "signifier",
"Endomycetaceae", "shallowish",
"sequacity", "periarthritis",
"bathysphere", "pentosuria",
"Dadaism", "spookdom",
"Consolamentum", "afterpressure",
"mutter", "louse",
"ovoviviparous", "corbel",
"metastoma", "biventer",
"Hydrangea", "hogmace",
"seizing", "nonsuppressed",
"oratorize", "uncarefully",
"benzothiofuran", "penult",
"balanocele", "macropterous",
"dishpan", "marten",
"absvolt", "jirble",
"parmelioid", "airfreighter",
"acocotl", "archesporial",
"hypoplastral", "preoral",
"quailberry", "cinque",
"terrestrially", "stroking",
"limpet", "moodishness",
"canicule", "archididascalian",
"pompiloid", "overstaid",
"introducer", "Italical",
"Christianopaganism", "prescriptible",
"subofficer", "danseuse",
"cloy", "saguran",
"frictionlessly", "deindividualization",
"Bulanda", "ventricous",
"subfoliar", "basto",
"scapuloradial", "suspend",
"stiffish", "Sphenodontidae",
"eternal", "verbid",
"mammonish", "upcushion",
"barkometer", "concretion",
"preagitate", "incomprehensible",
"tristich", "visceral",
"hemimelus", "patroller",
"stentorophonic", "pinulus",
"kerykeion", "brutism",
"monstership", "merciful",
"overinstruct", "defensibly",
"bettermost", "splenauxe",
"Mormyrus", "unreprimanded",
"taver", "ell",
"proacquittal", "infestation",
"overwoven", "Lincolnlike",
"chacona", "Tamil",
"classificational", "lebensraum",
"reeveland", "intuition",
"Whilkut", "focaloid",
"Eleusinian", "micromembrane",
"byroad", "nonrepetition",
"bacterioblast", "brag",
"ribaldrous", "phytoma",
"counteralliance", "pelvimetry",
"pelf", "relaster",
"thermoresistant", "aneurism",
"molossic", "euphonym",
"upswell", "ladhood",
"phallaceous", "inertly",
"gunshop", "stereotypography",
"laryngic", "refasten",
"twinling", "oflete",
"hepatorrhaphy", "electrotechnics",
"cockal", "guitarist",
"topsail", "Cimmerianism",
"larklike", "Llandovery",
"pyrocatechol", "immatchable",
"chooser", "metrocratic",
"craglike", "quadrennial",
"nonpoisonous", "undercolored",
"knob", "ultratense",
"balladmonger", "slait",
"sialadenitis", "bucketer",
"magnificently", "unstipulated",
"unscourged", "unsupercilious",
"packsack", "pansophism",
"soorkee", "percent",
"subirrigate", "champer",
"metapolitics", "spherulitic",
"involatile", "metaphonical",
"stachyuraceous", "speckedness",
"bespin", "proboscidiform",
"gul", "squit",
"yeelaman", "peristeropode",
"opacousness", "shibuichi",
"retinize", "yote",
"misexposition", "devilwise",
"pumpkinification", "vinny",
"bonze", "glossing",
"decardinalize", "transcortical",
"serphoid", "deepmost",
"guanajuatite", "wemless",
"arval", "lammy",
"Effie", "Saponaria",
"tetrahedral", "prolificy",
"excerpt", "dunkadoo",
"Spencerism", "insatiately",
"Gilaki", "oratorship",
"arduousness", "unbashfulness",
"Pithecolobium", "unisexuality",
"veterinarian", "detractive",
"liquidity", "acidophile",
"proauction", "sural",
"totaquina", "Vichyite",
"uninhabitedness", "allegedly",
"Gothish", "manny",
"Inger", "flutist",
"ticktick", "Ludgatian",
"homotransplant", "orthopedical",
"diminutively", "monogoneutic",
"Kenipsim", "sarcologist",
"drome", "stronghearted",
"Fameuse", "Swaziland",
"alen", "chilblain",
"beatable", "agglomeratic",
"constitutor", "tendomucoid",
"porencephalous", "arteriasis",
"boser", "tantivy",
"rede", "lineamental",
"uncontradictableness", "homeotypical",
"masa", "folious",
"dosseret", "neurodegenerative",
"subtransverse", "Chiasmodontidae",
"palaeotheriodont", "unstressedly",
"chalcites", "piquantness",
"lampyrine", "Aplacentalia",
"projecting", "elastivity",
"isopelletierin", "bladderwort",
"strander", "almud",
"iniquitously", "theologal",
"bugre", "chargeably",
"imperceptivity", "meriquinoidal",
"mesophyte", "divinator",
"perfunctory", "counterappellant",
"synovial", "charioteer",
"crystallographical", "comprovincial",
"infrastapedial", "pleasurehood",
"inventurous", "ultrasystematic",
"subangulated", "supraoesophageal",
"Vaishnavism", "transude",
"chrysochrous", "ungrave",
"reconciliable", "uninterpleaded",
"erlking", "wherefrom",
"aprosopia", "antiadiaphorist",
"metoxazine", "incalculable",
"umbellic", "predebit",
"foursquare", "unimmortal",
"nonmanufacture", "slangy",
"predisputant", "familist",
"preaffiliate", "friarhood",
"corelysis", "zoonitic",
"halloo", "paunchy",
"neuromimesis", "aconitine",
"hackneyed", "unfeeble",
"cubby", "autoschediastical",
"naprapath", "lyrebird",
"inexistency", "leucophoenicite",
"ferrogoslarite", "reperuse",
"uncombable", "tambo",
"propodiale", "diplomatize",
"Russifier", "clanned",
"corona", "michigan",
"nonutilitarian", "transcorporeal",
"bought", "Cercosporella",
"stapedius", "glandularly",
"pictorially", "weism",
"disilane", "rainproof",
"Caphtor", "scrubbed",
"oinomancy", "pseudoxanthine",
"nonlustrous", "redesertion",
"Oryzorictinae", "gala",
"Mycogone", "reappreciate",
"cyanoguanidine", "seeingness",
"breadwinner", "noreast",
"furacious", "epauliere",
"omniscribent", "Passiflorales",
"uninductive", "inductivity",
"Orbitolina", "Semecarpus",
"migrainoid", "steprelationship",
"phlogisticate", "mesymnion",
"sloped", "edificator",
"beneficent", "culm",
"paleornithology", "unurban",
"throbless", "amplexifoliate",
"sesquiquintile", "sapience",
"astucious", "dithery",
"boor", "ambitus",
"scotching", "uloid",
"uncompromisingness", "hoove",
"waird", "marshiness",
"Jerusalem", "mericarp",
"unevoked", "benzoperoxide",
"outguess", "pyxie",
"hymnic", "euphemize",
"mendacity", "erythremia",
"rosaniline", "unchatteled",
"lienteria", "Bushongo",
"dialoguer", "unrepealably",
"rivethead", "antideflation",
"vinegarish", "manganosiderite",
"doubtingness", "ovopyriform",
"Cephalodiscus", "Muscicapa",
"Animalivora", "angina",
"planispheric", "ipomoein",
"cuproiodargyrite", "sandbox",
"scrat", "Munnopsidae",
"shola", "pentafid",
"overstudiousness", "times",
"nonprofession", "appetible",
"valvulotomy", "goladar",
"uniarticular", "oxyterpene",
"unlapsing", "omega",
"trophonema", "seminonflammable",
"circumzenithal", "starer",
"depthwise", "liberatress",
"unleavened", "unrevolting",
"groundneedle", "topline",
"wandoo", "umangite",
"ordinant", "unachievable",
"oversand", "snare",
"avengeful", "unexplicit",
"mustafina", "sonable",
"rehabilitative", "eulogization",
"papery", "technopsychology",
"impressor", "cresylite",
"entame", "transudatory",
"scotale", "pachydermatoid",
"imaginary", "yeat",
"slipped", "stewardship",
"adatom", "cockstone",
"skyshine", "heavenful",
"comparability", "exprobratory",
"dermorhynchous", "parquet",
"cretaceous", "vesperal",
"raphis", "undangered",
"Glecoma", "engrain",
"counteractively", "Zuludom",
"orchiocatabasis", "Auriculariales",
"warriorwise", "extraorganismal",
"overbuilt", "alveolite",
"tetchy", "terrificness",
"widdle", "unpremonished",
"rebilling", "sequestrum",
"equiconvex", "heliocentricism",
"catabaptist", "okonite",
"propheticism", "helminthagogic",
"calycular", "giantly",
"wingable", "golem",
"unprovided", "commandingness",
"greave", "haply",
"doina", "depressingly",
"subdentate", "impairment",
"decidable", "neurotrophic",
"unpredict", "bicorporeal",
"pendulant", "flatman",
"intrabred", "toplike",
"Prosobranchiata", "farrantly",
"toxoplasmosis", "gorilloid",
"dipsomaniacal", "aquiline",
"atlantite", "ascitic",
"perculsive", "prospectiveness",
"saponaceous", "centrifugalization",
"dinical", "infravaginal",
"beadroll", "affaite",
"Helvidian", "tickleproof",
"abstractionism", "enhedge",
"outwealth", "overcontribute",
"coldfinch", "gymnastic",
"Pincian", "Munychian",
"codisjunct", "quad",
"coracomandibular", "phoenicochroite",
"amender", "selectivity",
"putative", "semantician",
"lophotrichic", "Spatangoidea",
"saccharogenic", "inferent",
"Triconodonta", "arrendation",
"sheepskin", "taurocolla",
"bunghole", "Machiavel",
"triakistetrahedral", "dehairer",
"prezygapophysial", "cylindric",
"pneumonalgia", "sleigher",
"emir", "Socraticism",
"licitness", "massedly",
"instructiveness", "sturdied",
"redecrease", "starosta",
"evictor", "orgiastic",
"squdge", "meloplasty",
"Tsonecan", "repealableness",
"swoony", "myesthesia",
"molecule", "autobiographist",
"reciprocation", "refective",
"unobservantness", "tricae",
"ungouged", "floatability",
"Mesua", "fetlocked",
"chordacentrum", "sedentariness",
"various", "laubanite",
"nectopod", "zenick",
"sequentially", "analgic",
"biodynamics", "posttraumatic",
"nummi", "pyroacetic",
"bot", "redescend",
"dispermy", "undiffusive",
"circular", "trillion",
"Uraniidae", "ploration",
"discipular", "potentness",
"sud", "Hu",
"Eryon", "plugger",
"subdrainage", "jharal",
"abscission", "supermarket",
"countergabion", "glacierist",
"lithotresis", "minniebush",
"zanyism", "eucalypteol",
"sterilely", "unrealize",
"unpatched", "hypochondriacism",
"critically", "cheesecutter",
};
}
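A worked example of the map-count arithmetic in run() above; the numbers are illustrative and only mirror the defaults visible in the code.
// Illustrative sizing for random-text-writer (values are made up).
long bytesPerMap  = 1L * 1024 * 1024 * 1024;   // BYTES_PER_MAP default: 1 GB
int  mapsPerHost  = 10;                        // MAPS_PER_HOST default
int  taskTrackers = 20;                        // from ClusterStatus
long totalBytes   = mapsPerHost * bytesPerMap * taskTrackers;  // TOTAL_BYTES default
int  numMaps      = (int) (totalBytes / bytesPerMap);          // 200 maps, ~1 GB each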

View File

@ -0,0 +1,298 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* This program uses map/reduce to just run a distributed job where there is
* no interaction between the tasks and each task writes a large unsorted
* random binary sequence file of BytesWritable.
* In order for this program to generate data for terasort with 10-byte keys
* and 90-byte values, have the following config:
* <xmp>
* <?xml version="1.0"?>
* <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
* <configuration>
* <property>
* <name>mapreduce.randomwriter.minkey</name>
* <value>10</value>
* </property>
* <property>
* <name>mapreduce.randomwriter.maxkey</name>
* <value>10</value>
* </property>
* <property>
* <name>mapreduce.randomwriter.minvalue</name>
* <value>90</value>
* </property>
* <property>
* <name>mapreduce.randomwriter.maxvalue</name>
* <value>90</value>
* </property>
* <property>
* <name>mapreduce.randomwriter.totalbytes</name>
* <value>1099511627776</value>
* </property>
* </configuration></xmp>
*
* Equivalently, {@link RandomWriter} also supports all the above options
* and ones supported by {@link GenericOptionsParser} via the command-line.
*/
public class RandomWriter extends Configured implements Tool {
public static final String TOTAL_BYTES = "mapreduce.randomwriter.totalbytes";
public static final String BYTES_PER_MAP =
"mapreduce.randomwriter.bytespermap";
public static final String MAPS_PER_HOST =
"mapreduce.randomwriter.mapsperhost";
public static final String MAX_VALUE = "mapreduce.randomwriter.maxvalue";
public static final String MIN_VALUE = "mapreduce.randomwriter.minvalue";
public static final String MIN_KEY = "mapreduce.randomwriter.minkey";
public static final String MAX_KEY = "mapreduce.randomwriter.maxkey";
/**
* User counters
*/
static enum Counters { RECORDS_WRITTEN, BYTES_WRITTEN }
/**
* A custom input format that creates virtual inputs of a single string
* for each map.
*/
static class RandomInputFormat extends InputFormat<Text, Text> {
/**
* Generate the requested number of file splits, with the filename
* set to the filename of the output file.
*/
public List<InputSplit> getSplits(JobContext job) throws IOException {
List<InputSplit> result = new ArrayList<InputSplit>();
Path outDir = FileOutputFormat.getOutputPath(job);
int numSplits =
job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
for(int i=0; i < numSplits; ++i) {
result.add(new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1,
(String[])null));
}
return result;
}
/**
* Return a single record (filename, "") where the filename is taken from
* the file split.
*/
static class RandomRecordReader extends RecordReader<Text, Text> {
Path name;
Text key = null;
Text value = new Text();
public RandomRecordReader(Path p) {
name = p;
}
public void initialize(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
}
public boolean nextKeyValue() {
if (name != null) {
key = new Text();
key.set(name.getName());
name = null;
return true;
}
return false;
}
public Text getCurrentKey() {
return key;
}
public Text getCurrentValue() {
return value;
}
public void close() {}
public float getProgress() {
return 0.0f;
}
}
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return new RandomRecordReader(((FileSplit) split).getPath());
}
}
static class RandomMapper extends Mapper<WritableComparable, Writable,
BytesWritable, BytesWritable> {
private long numBytesToWrite;
private int minKeySize;
private int keySizeRange;
private int minValueSize;
private int valueSizeRange;
private Random random = new Random();
private BytesWritable randomKey = new BytesWritable();
private BytesWritable randomValue = new BytesWritable();
private void randomizeBytes(byte[] data, int offset, int length) {
for(int i=offset + length - 1; i >= offset; --i) {
data[i] = (byte) random.nextInt(256);
}
}
/**
* Given an output filename, write a bunch of random records to it.
*/
public void map(WritableComparable key,
Writable value,
Context context) throws IOException,InterruptedException {
int itemCount = 0;
while (numBytesToWrite > 0) {
int keyLength = minKeySize +
(keySizeRange != 0 ? random.nextInt(keySizeRange) : 0);
randomKey.setSize(keyLength);
randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
int valueLength = minValueSize +
(valueSizeRange != 0 ? random.nextInt(valueSizeRange) : 0);
randomValue.setSize(valueLength);
randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
context.write(randomKey, randomValue);
numBytesToWrite -= keyLength + valueLength;
context.getCounter(Counters.BYTES_WRITTEN).increment(keyLength + valueLength);
context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
if (++itemCount % 200 == 0) {
context.setStatus("wrote record " + itemCount + ". " +
numBytesToWrite + " bytes left.");
}
}
context.setStatus("done with " + itemCount + " records.");
}
/**
* Save the values out of the configuration that we need to write
* the data.
*/
@Override
public void setup(Context context) {
Configuration conf = context.getConfiguration();
numBytesToWrite = conf.getLong(BYTES_PER_MAP,
1*1024*1024*1024);
minKeySize = conf.getInt(MIN_KEY, 10);
keySizeRange =
conf.getInt(MAX_KEY, 1000) - minKeySize;
minValueSize = conf.getInt(MIN_VALUE, 0);
valueSizeRange =
conf.getInt(MAX_VALUE, 20000) - minValueSize;
}
}
/**
* This is the main routine for launching a distributed random write job.
* It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
* The reduce doesn't do anything.
*
* @throws IOException
*/
public int run(String[] args) throws Exception {
if (args.length == 0) {
System.out.println("Usage: writer <out-dir>");
ToolRunner.printGenericCommandUsage(System.out);
return 2;
}
Path outDir = new Path(args[0]);
Configuration conf = getConf();
JobClient client = new JobClient(conf);
ClusterStatus cluster = client.getClusterStatus();
int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
1*1024*1024*1024);
if (numBytesToWritePerMap == 0) {
System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
return -2;
}
long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
if (numMaps == 0 && totalBytesToWrite > 0) {
numMaps = 1;
conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
}
conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
Job job = new Job(conf);
job.setJarByClass(RandomWriter.class);
job.setJobName("random-writer");
FileOutputFormat.setOutputPath(job, outDir);
job.setOutputKeyClass(BytesWritable.class);
job.setOutputValueClass(BytesWritable.class);
job.setInputFormatClass(RandomInputFormat.class);
job.setMapperClass(RandomMapper.class);
job.setReducerClass(Reducer.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
System.out.println("Running " + numMaps + " maps.");
// reducer NONE
job.setNumReduceTasks(0);
Date startTime = new Date();
System.out.println("Job started: " + startTime);
int ret = job.waitForCompletion(true) ? 0 : 1;
Date endTime = new Date();
System.out.println("Job ended: " + endTime);
System.out.println("The job took " +
(endTime.getTime() - startTime.getTime()) /1000 +
" seconds.");
return ret;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new RandomWriter(), args);
System.exit(res);
}
}
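One point worth spelling out: RandomInputFormat hands each map task a dummy split whose record reader yields exactly one (filename, "") pair, so the framework invokes map() once per split and the while-loop inside map() does all of the writing. A small hedged sketch of that single-record behaviour follows; the split name is illustrative and the caller is assumed to live in the same package, since the nested classes are package-private.
// Sketch only; not part of this diff.
RandomWriter.RandomInputFormat.RandomRecordReader rr =
    new RandomWriter.RandomInputFormat.RandomRecordReader(new Path("dummy-split-0"));
boolean first  = rr.nextKeyValue();   // true:  the one synthetic record
boolean second = rr.nextKeyValue();   // false: map() will not be called again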

View File

@ -19,7 +19,9 @@
package org.apache.hadoop.mapreduce.lib.input;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import static org.junit.Assert.*;
@ -28,10 +30,15 @@ import static org.mockito.Mockito.*;
import static org.apache.hadoop.test.MockitoMaker.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class TestFileInputFormat {
@ -80,4 +87,108 @@ public class TestFileInputFormat {
ispy.getSplits(job);
verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
}
@Test
@SuppressWarnings({"rawtypes", "unchecked"})
public void testLastInputSplitAtSplitBoundary() throws Exception {
FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024,
128l * 1024 * 1024);
Configuration conf = new Configuration();
JobContext jobContext = mock(JobContext.class);
when(jobContext.getConfiguration()).thenReturn(conf);
List<InputSplit> splits = fif.getSplits(jobContext);
assertEquals(8, splits.size());
for (int i = 0 ; i < splits.size() ; i++) {
InputSplit split = splits.get(i);
assertEquals(("host" + i), split.getLocations()[0]);
}
}
@Test
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testLastInputSplitExceedingSplitBoundary() throws Exception {
FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024,
128l * 1024 * 1024);
Configuration conf = new Configuration();
JobContext jobContext = mock(JobContext.class);
when(jobContext.getConfiguration()).thenReturn(conf);
List<InputSplit> splits = fif.getSplits(jobContext);
assertEquals(8, splits.size());
for (int i = 0; i < splits.size(); i++) {
InputSplit split = splits.get(i);
assertEquals(("host" + i), split.getLocations()[0]);
}
}
@Test
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testLastInputSplitSingleSplit() throws Exception {
FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024,
128l * 1024 * 1024);
Configuration conf = new Configuration();
JobContext jobContext = mock(JobContext.class);
when(jobContext.getConfiguration()).thenReturn(conf);
List<InputSplit> splits = fif.getSplits(jobContext);
assertEquals(1, splits.size());
for (int i = 0; i < splits.size(); i++) {
InputSplit split = splits.get(i);
assertEquals(("host" + i), split.getLocations()[0]);
}
}
private class FileInputFormatForTest<K, V> extends FileInputFormat<K, V> {
long splitSize;
long length;
FileInputFormatForTest(long length, long splitSize) {
this.length = length;
this.splitSize = splitSize;
}
@Override
public RecordReader<K, V> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException, InterruptedException {
return null;
}
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
FileStatus mockFileStatus = mock(FileStatus.class);
when(mockFileStatus.getBlockSize()).thenReturn(splitSize);
Path mockPath = mock(Path.class);
FileSystem mockFs = mock(FileSystem.class);
BlockLocation[] blockLocations = mockBlockLocations(length, splitSize);
when(mockFs.getFileBlockLocations(mockFileStatus, 0, length)).thenReturn(
blockLocations);
when(mockPath.getFileSystem(any(Configuration.class))).thenReturn(mockFs);
when(mockFileStatus.getPath()).thenReturn(mockPath);
when(mockFileStatus.getLen()).thenReturn(length);
List<FileStatus> list = new ArrayList<FileStatus>();
list.add(mockFileStatus);
return list;
}
@Override
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
return splitSize;
}
private BlockLocation[] mockBlockLocations(long size, long splitSize) {
int numLocations = (int) (size / splitSize);
if (size % splitSize != 0)
numLocations++;
BlockLocation[] blockLocations = new BlockLocation[numLocations];
for (int i = 0; i < numLocations; i++) {
String[] names = new String[] { "b" + i };
String[] hosts = new String[] { "host" + i };
blockLocations[i] = new BlockLocation(names, hosts, i * splitSize,
Math.min(splitSize, size - (splitSize * i)));
}
return blockLocations;
}
}
}

View File

@ -238,9 +238,11 @@ public class TeraGen extends Configured implements Tool {
@Override
public void cleanup(Context context) {
if (checksumCounter != null) {
checksumCounter.increment(total.getLow8());
}
}
}
private static void usage() throws IOException {
System.err.println("teragen <num rows> <output dir>");
@ -307,5 +309,4 @@ public class TeraGen extends Configured implements Tool {
int res = ToolRunner.run(new Configuration(), new TeraGen(), args);
System.exit(res);
}
}

View File

@ -85,20 +85,6 @@ public interface ApplicationConstants {
public static final String STDOUT = "stdout";
/**
* Classpath for typical applications.
*/
public static final String[] APPLICATION_CLASSPATH =
new String[] {
"$HADOOP_CONF_DIR",
"$HADOOP_COMMON_HOME/share/hadoop/common/*",
"$HADOOP_COMMON_HOME/share/hadoop/common/lib/*",
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/*",
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*",
"$YARN_HOME/share/hadoop/mapreduce/*",
"$YARN_HOME/share/hadoop/mapreduce/lib/*"
};
/**
* Environment for Applications.
*

View File

@ -508,6 +508,10 @@ public class YarnConfiguration extends Configuration {
public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
2000;
/** Standard Hadoop classes */
public static final String YARN_APPLICATION_CLASSPATH = YARN_PREFIX
+ "application.classpath";
public YarnConfiguration() {
super();
}

View File

@ -36,6 +36,7 @@ import com.google.common.collect.Lists;
import com.google.inject.Provides;
import com.google.inject.servlet.GuiceFilter;
import com.google.inject.servlet.ServletModule;
import com.sun.jersey.api.container.filter.GZIPContentEncodingFilter;
import com.sun.jersey.api.core.ResourceConfig;
import com.sun.jersey.core.util.FeaturesAndProperties;
import com.sun.jersey.guice.spi.container.servlet.GuiceContainer;
@ -160,6 +161,8 @@ public abstract class WebApp extends ServletModule {
params.put(ResourceConfig.FEATURE_IMPLICIT_VIEWABLES, "true");
params.put(ServletContainer.FEATURE_FILTER_FORWARD_ON_404, "true");
params.put(FeaturesAndProperties.FEATURE_XMLROOTELEMENT_PROCESSING, "true");
params.put(ResourceConfig.PROPERTY_CONTAINER_REQUEST_FILTERS, GZIPContentEncodingFilter.class.getName());
params.put(ResourceConfig.PROPERTY_CONTAINER_RESPONSE_FILTERS, GZIPContentEncodingFilter.class.getName());
filter("/*").through(GuiceContainer.class, params);
}
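Registering GZIPContentEncodingFilter on both the request and response sides means Jersey will transparently gunzip compressed request bodies and gzip responses for clients that advertise support. A hedged client-side sketch follows; the host, port, and resource path are illustrative, not taken from this diff.
// Minimal client check that the response is gzipped only when requested (sketch).
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.zip.GZIPInputStream;
public class GzipWsClient {
  public static void main(String[] args) throws Exception {
    HttpURLConnection conn = (HttpURLConnection)
        new URL("http://rmhost:8088/ws/v1/cluster/info").openConnection();
    conn.setRequestProperty("Accept-Encoding", "gzip");
    InputStream in = "gzip".equals(conn.getContentEncoding())
        ? new GZIPInputStream(conn.getInputStream())
        : conn.getInputStream();
    System.out.println("bytes available: " + in.available());
    in.close();
  }
}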

View File

@ -482,4 +482,18 @@
<name>yarn.web-proxy.address</name>
<value/>
</property>
<property>
<description>Classpath for typical applications.</description>
<name>yarn.application.classpath</name>
<value>
$HADOOP_CONF_DIR,
$HADOOP_COMMON_HOME/share/hadoop/common/*,
$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
$HADOOP_HDFS_HOME/share/hadoop/hdfs/*,
$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
$YARN_HOME/share/hadoop/mapreduce/*,
$YARN_HOME/share/hadoop/mapreduce/lib/*
</value>
</property>
</configuration>
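A hedged sketch of how these comma-separated entries are presumably consumed when a container environment is assembled (the real logic lives in MRApps.setClasspath(environment, conf), which is not shown in full in this diff).
// Sketch, assuming the standard Configuration/YarnConfiguration APIs.
Configuration conf = new YarnConfiguration();
Map<String, String> environment = new HashMap<String, String>();
String[] entries = conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH);
if (entries != null) {
  for (String entry : entries) {
    String cur = environment.get("CLASSPATH");
    environment.put("CLASSPATH",
        cur == null ? entry.trim() : cur + ":" + entry.trim());
  }
}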

View File

@ -295,10 +295,6 @@ public class SchedulerApp {
}
}
public synchronized void setAvailableResourceLimit(Resource globalLimit) {
this.resourceLimit = globalLimit;
}
public synchronized RMContainer getRMContainer(ContainerId id) {
return liveContainers.get(id);
}
@ -446,20 +442,21 @@ public class SchedulerApp {
return reservedContainers;
}
public synchronized void setHeadroom(Resource globalLimit) {
this.resourceLimit = globalLimit;
}
/**
* Get available headroom in terms of resources for the application's user.
* @return available resource headroom
*/
public synchronized Resource getHeadroom() {
Resource limit = Resources.subtract(resourceLimit, currentConsumption);
Resources.subtractFrom(limit, currentReservation);
// Corner case to deal with applications being slightly over-limit
if (limit.getMemory() < 0) {
limit.setMemory(0);
if (resourceLimit.getMemory() < 0) {
resourceLimit.setMemory(0);
}
return limit;
return resourceLimit;
}
public Queue getQueue() {

View File

@ -17,12 +17,19 @@
*/
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.yarn.api.records.Resource;
class CSQueueUtils {
public static void checkMaxCapacity(String queueName,
float capacity, float maximumCapacity) {
if (Math.round(100 * maximumCapacity) != CapacitySchedulerConfiguration.UNDEFINED &&
if (maximumCapacity < 0.0f || maximumCapacity > 1.0f ||
maximumCapacity < capacity) {
throw new IllegalArgumentException(
"Illegal value of maximumCapacity " + maximumCapacity +
" used in call to setMaxCapacity for queue " + queueName);
}
if (maximumCapacity < capacity) {
throw new IllegalArgumentException(
"Illegal call to setMaxCapacity. " +
"Queue '" + queueName + "' has " +
@ -31,4 +38,25 @@ class CSQueueUtils {
}
}
public static float computeAbsoluteMaximumCapacity(
float maximumCapacity, CSQueue parent) {
float parentAbsMaxCapacity =
(parent == null) ? 1.0f : parent.getAbsoluteMaximumCapacity();
return (parentAbsMaxCapacity * maximumCapacity);
}
public static int computeMaxActiveApplications(Resource clusterResource,
float maxAMResourcePercent, float absoluteCapacity) {
return
Math.max(
(int)((clusterResource.getMemory() / (float)LeafQueue.DEFAULT_AM_RESOURCE) *
maxAMResourcePercent * absoluteCapacity),
1);
}
public static int computeMaxActiveApplicationsPerUser(
int maxActiveApplications, int userLimit, float userLimitFactor) {
return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
}
}
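Worked numbers for the two helpers above. The per-AM resource is assumed here to be 2048 MB (LeafQueue.DEFAULT_AM_RESOURCE); the rest of the values are illustrative.
// Illustrative limits for a queue owning half of a 100 GB cluster.
int   clusterMemory        = 100 * 1024;  // MB
float maxAMResourcePercent = 0.1f;        // 10% of the queue may be AM containers
float absoluteCapacity     = 0.5f;
int maxActive = Math.max(
    (int) ((clusterMemory / 2048f) * maxAMResourcePercent * absoluteCapacity), 1);  // 2
int   userLimit       = 100;
float userLimitFactor = 1.0f;
int maxActivePerUser =
    (int) (maxActive * (userLimit / 100.0f) * userLimitFactor);                      // 2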

View File

@ -149,7 +149,7 @@ public class CapacitySchedulerConfiguration extends Configuration {
throw new IllegalArgumentException("Illegal " +
"capacity of " + capacity + " for queue " + queue);
}
LOG.debug("CSConf - setCapacity: queuePrefix=" + getQueuePrefix(queue) +
LOG.debug("CSConf - getCapacity: queuePrefix=" + getQueuePrefix(queue) +
", capacity=" + capacity);
return capacity;
}
@ -162,11 +162,15 @@ public class CapacitySchedulerConfiguration extends Configuration {
public int getMaximumCapacity(String queue) {
int maxCapacity =
getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, UNDEFINED);
getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, MAXIMUM_CAPACITY_VALUE);
return maxCapacity;
}
public void setMaximumCapacity(String queue, int maxCapacity) {
if (maxCapacity > MAXIMUM_CAPACITY_VALUE) {
throw new IllegalArgumentException("Illegal " +
"maximum-capacity of " + maxCapacity + " for queue " + queue);
}
setInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, maxCapacity);
LOG.debug("CSConf - setMaxCapacity: queuePrefix=" + getQueuePrefix(queue) +
", maxCapacity=" + maxCapacity);

View File

@ -144,10 +144,10 @@ public class LeafQueue implements CSQueue {
(float)cs.getConfiguration().getCapacity(getQueuePath()) / 100;
float absoluteCapacity = parent.getAbsoluteCapacity() * capacity;
float maximumCapacity = (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
float maximumCapacity =
(float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
float absoluteMaxCapacity =
(Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ?
Float.MAX_VALUE : (parent.getAbsoluteCapacity() * maximumCapacity);
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
int userLimit = cs.getConfiguration().getUserLimit(getQueuePath());
float userLimitFactor =
@ -161,10 +161,10 @@ public class LeafQueue implements CSQueue {
this.maxAMResourcePercent =
cs.getConfiguration().getMaximumApplicationMasterResourcePercent();
int maxActiveApplications =
computeMaxActiveApplications(cs.getClusterResources(),
CSQueueUtils.computeMaxActiveApplications(cs.getClusterResources(),
maxAMResourcePercent, absoluteCapacity);
int maxActiveApplicationsPerUser =
computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
userLimitFactor);
this.queueInfo = recordFactory.newRecordInstance(QueueInfo.class);
@ -193,20 +193,6 @@ public class LeafQueue implements CSQueue {
this.activeApplications = new TreeSet<SchedulerApp>(applicationComparator);
}
private int computeMaxActiveApplications(Resource clusterResource,
float maxAMResourcePercent, float absoluteCapacity) {
return
Math.max(
(int)((clusterResource.getMemory() / (float)DEFAULT_AM_RESOURCE) *
maxAMResourcePercent * absoluteCapacity),
1);
}
private int computeMaxActiveApplicationsPerUser(int maxActiveApplications,
int userLimit, float userLimitFactor) {
return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
}
private synchronized void setupQueueConfigs(
float capacity, float absoluteCapacity,
float maximumCapacity, float absoluteMaxCapacity,
@ -254,8 +240,8 @@ public class LeafQueue implements CSQueue {
"maxCapacity = " + maximumCapacity +
" [= configuredMaxCapacity ]" + "\n" +
"absoluteMaxCapacity = " + absoluteMaxCapacity +
" [= Float.MAX_VALUE if maximumCapacity undefined, " +
"(parentAbsoluteCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
" [= 1.0 maximumCapacity undefined, " +
"(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
"userLimit = " + userLimit +
" [= configuredUserLimit ]" + "\n" +
"userLimitFactor = " + userLimitFactor +
@ -272,9 +258,9 @@ public class LeafQueue implements CSQueue {
"maxActiveApplicationsPerUser = " + maxActiveApplicationsPerUser +
" [= (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor) ]" + "\n" +
"utilization = " + utilization +
" [= usedResourcesMemory / queueLimit ]" + "\n" +
" [= usedResourcesMemory / (clusterResourceMemory * absoluteCapacity)]" + "\n" +
"usedCapacity = " + usedCapacity +
" [= usedResourcesMemory / (clusterResourceMemory * capacity) ]" + "\n" +
" [= usedResourcesMemory / (clusterResourceMemory * parent.absoluteCapacity)]" + "\n" +
"maxAMResourcePercent = " + maxAMResourcePercent +
" [= configuredMaximumAMResourcePercent ]" + "\n" +
"minimumAllocationFactor = " + minimumAllocationFactor +
@ -400,9 +386,7 @@ public class LeafQueue implements CSQueue {
this.maximumCapacity = maximumCapacity;
this.absoluteMaxCapacity =
(Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ?
Float.MAX_VALUE :
(parent.getAbsoluteCapacity() * maximumCapacity);
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
}
/**
@ -502,9 +486,14 @@ public class LeafQueue implements CSQueue {
}
public String toString() {
return queueName + ":" + capacity + ":" + absoluteCapacity + ":" +
getUsedCapacity() + ":" + getUtilization() + ":" +
getNumApplications() + ":" + getNumContainers();
return queueName + ": " +
"capacity=" + capacity + ", " +
"absoluteCapacity=" + absoluteCapacity + ", " +
"usedResources=" + usedResources.getMemory() + "MB, " +
"usedCapacity=" + getUsedCapacity() + ", " +
"utilization=" + getUtilization() + ", " +
"numApps=" + getNumApplications() + ", " +
"numContainers=" + getNumContainers();
}
private synchronized User getUser(String userName) {
@ -731,12 +720,11 @@ public class LeafQueue implements CSQueue {
if(LOG.isDebugEnabled()) {
LOG.debug("pre-assignContainers for application "
+ application.getApplicationId());
}
application.showRequests();
}
synchronized (application) {
computeAndSetUserResourceLimit(application, clusterResource);
// Schedule in priority order
for (Priority priority : application.getPriorities()) {
// Required resource
Resource required =
@ -747,15 +735,21 @@ public class LeafQueue implements CSQueue {
continue;
}
// Are we going over limits by allocating to this application?
// Maximum Capacity of the queue
// Compute & set headroom
// Note: We set the headroom with the highest priority request
// as the target.
// This works since we never assign lower priority requests
// before all higher priority ones are serviced.
Resource userLimit =
computeAndSetUserResourceLimit(application, clusterResource,
required);
// Check queue max-capacity limit
if (!assignToQueue(clusterResource, required)) {
return NULL_ASSIGNMENT;
}
// User limits
Resource userLimit =
computeUserLimit(application, clusterResource, required);
// Check user limit
if (!assignToUser(application.getUser(), userLimit)) {
break;
}
@ -830,25 +824,28 @@ public class LeafQueue implements CSQueue {
float potentialNewCapacity =
(float)(usedResources.getMemory() + required.getMemory()) /
clusterResource.getMemory();
if (potentialNewCapacity > absoluteMaxCapacity) {
LOG.info(getQueueName() +
" usedResources: " + usedResources.getMemory() +
" clusterResources: " + clusterResource.getMemory() +
" currentCapacity " + ((float)usedResources.getMemory())/clusterResource.getMemory() +
" required " + required.getMemory() +
" potentialNewCapacity: " + potentialNewCapacity + " ( " +
" max-capacity: " + absoluteMaxCapacity + ")");
if (potentialNewCapacity > absoluteMaxCapacity) {
return false;
}
return true;
}
private void computeAndSetUserResourceLimit(SchedulerApp application,
Resource clusterResource) {
Resource userLimit =
computeUserLimit(application, clusterResource, Resources.none());
application.setAvailableResourceLimit(userLimit);
metrics.setAvailableResourcesToUser(application.getUser(),
application.getHeadroom());
private Resource computeAndSetUserResourceLimit(SchedulerApp application,
Resource clusterResource, Resource required) {
String user = application.getUser();
Resource limit = computeUserLimit(application, clusterResource, required);
Resource headroom =
Resources.subtract(limit, getUser(user).getConsumedResources());
application.setHeadroom(headroom);
metrics.setAvailableResourcesToUser(user, headroom);
return limit;
}
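Illustrative numbers for the headroom handed to the application above: the queue now computes the user limit minus that user's consumed resources and pushes it in via setHeadroom(), so SchedulerApp.getHeadroom() only has to clamp a negative value to zero. Values below are made up; the Resources helpers are the scheduler's own utilities.
Resource limit    = Resources.createResource(40 * 1024);  // computed user limit, 40 GB
Resource consumed = Resources.createResource(25 * 1024);  // user's current consumption
Resource headroom = Resources.subtract(limit, consumed);  // 15 GB
application.setHeadroom(headroom);                        // what this hunk now does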
private int roundUp(int memory) {
@ -919,7 +916,7 @@ public class LeafQueue implements CSQueue {
User user = getUser(userName);
// Note: We aren't considering the current request since there is a fixed
// overhead of the AM, but it's a >= check, so...
// overhead of the AM, but it's a > check, not a >= check, so...
if ((user.getConsumedResources().getMemory()) > limit.getMemory()) {
if (LOG.isDebugEnabled()) {
LOG.debug("User " + userName + " in queue " + getQueueName() +
@ -1237,8 +1234,8 @@ public class LeafQueue implements CSQueue {
// happen under scheduler's lock...
// So, this is, in effect, a transaction across application & node
if (rmContainer.getState() == RMContainerState.RESERVED) {
application.unreserve(node, rmContainer.getReservedPriority());
node.unreserveResource(application);
unreserve(application, rmContainer.getReservedPriority(),
node, rmContainer);
} else {
application.containerCompleted(rmContainer, containerStatus, event);
node.releaseContainer(container);
@ -1303,23 +1300,24 @@ public class LeafQueue implements CSQueue {
public synchronized void updateClusterResource(Resource clusterResource) {
// Update queue properties
maxActiveApplications =
computeMaxActiveApplications(clusterResource, maxAMResourcePercent,
CSQueueUtils.computeMaxActiveApplications(clusterResource, maxAMResourcePercent,
absoluteCapacity);
maxActiveApplicationsPerUser =
computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
userLimitFactor);
// Update application properties
for (SchedulerApp application : activeApplications) {
computeAndSetUserResourceLimit(application, clusterResource);
computeAndSetUserResourceLimit(
application, clusterResource, Resources.none());
}
}
private synchronized void updateResource(Resource clusterResource) {
float queueLimit = clusterResource.getMemory() * absoluteCapacity;
setUtilization(usedResources.getMemory() / queueLimit);
setUsedCapacity(
usedResources.getMemory() / (clusterResource.getMemory() * capacity));
setUsedCapacity(usedResources.getMemory()
/ (clusterResource.getMemory() * parent.getAbsoluteCapacity()));
Resource resourceLimit =
Resources.createResource(roundUp((int)queueLimit));

View File

@ -118,16 +118,14 @@ public class ParentQueue implements CSQueue {
}
float capacity = (float) rawCapacity / 100;
float parentAbsoluteCapacity =
(parent == null) ? 1.0f : parent.getAbsoluteCapacity();
(rootQueue) ? 1.0f : parent.getAbsoluteCapacity();
float absoluteCapacity = parentAbsoluteCapacity * capacity;
float maximumCapacity =
(float) cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
float absoluteMaxCapacity =
(Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ?
Float.MAX_VALUE : (parentAbsoluteCapacity * maximumCapacity);
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
QueueState state = cs.getConfiguration().getState(getQueuePath());
@ -333,10 +331,15 @@ public class ParentQueue implements CSQueue {
}
public String toString() {
return queueName + ":" + capacity + ":" + absoluteCapacity + ":" +
getUsedCapacity() + ":" + getUtilization() + ":" +
getNumApplications() + ":" + getNumContainers() + ":" +
childQueues.size() + " child-queues";
return queueName + ": " +
"numChildQueue= " + childQueues.size() + ", " +
"capacity=" + capacity + ", " +
"absoluteCapacity=" + absoluteCapacity + ", " +
"usedResources=" + usedResources.getMemory() + "MB, " +
"usedCapacity=" + getUsedCapacity() + ", " +
"utilization=" + getUtilization() + ", " +
"numApps=" + getNumApplications() + ", " +
"numContainers=" + getNumContainers();
}
@Override
@ -492,12 +495,8 @@ public class ParentQueue implements CSQueue {
CSQueueUtils.checkMaxCapacity(getQueueName(), capacity, maximumCapacity);
this.maximumCapacity = maximumCapacity;
float parentAbsoluteCapacity =
(rootQueue) ? 100.0f : parent.getAbsoluteCapacity();
this.absoluteMaxCapacity =
(maximumCapacity == CapacitySchedulerConfiguration.UNDEFINED) ?
Float.MAX_VALUE :
(parentAbsoluteCapacity * maximumCapacity);
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
}
@Override
@ -689,9 +688,11 @@ public class ParentQueue implements CSQueue {
private synchronized void updateResource(Resource clusterResource) {
float queueLimit = clusterResource.getMemory() * absoluteCapacity;
float parentAbsoluteCapacity =
(rootQueue) ? 1.0f : parent.getAbsoluteCapacity();
setUtilization(usedResources.getMemory() / queueLimit);
setUsedCapacity(
usedResources.getMemory() / (clusterResource.getMemory() * capacity));
setUsedCapacity(usedResources.getMemory()
/ (clusterResource.getMemory() * parentAbsoluteCapacity));
Resource resourceLimit =
Resources.createResource((int)queueLimit);

View File

@ -358,7 +358,7 @@ public class FifoScheduler implements ResourceScheduler {
}
}
application.setAvailableResourceLimit(clusterResource);
application.setHeadroom(clusterResource);
LOG.debug("post-assignContainers");
application.showRequests();

View File

@ -21,16 +21,24 @@ import static org.junit.Assert.*;
import static org.mockito.Mockito.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.QueueACL;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
import org.junit.After;
@ -283,38 +291,76 @@ public class TestApplicationLimits {
final String user_0 = "user_0";
final String user_1 = "user_1";
int APPLICATION_ID = 0;
RecordFactory recordFactory =
RecordFactoryProvider.getRecordFactory(null);
RMContext rmContext = TestUtils.getMockRMContext();
// Submit first application from user_0, check headroom
SchedulerApp app_0_0 = getMockApplication(APPLICATION_ID++, user_0);
Priority priority_1 = TestUtils.createMockPriority(1);
// Submit first application with some resource-requests from user_0,
// and check headroom
final ApplicationAttemptId appAttemptId_0_0 =
TestUtils.getMockApplicationAttemptId(0, 0);
SchedulerApp app_0_0 =
spy(new SchedulerApp(appAttemptId_0_0, user_0, queue, rmContext, null));
queue.submitApplication(app_0_0, user_0, A);
queue.assignContainers(clusterResource, node_0); // Schedule to compute
List<ResourceRequest> app_0_0_requests = new ArrayList<ResourceRequest>();
app_0_0_requests.add(
TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2,
priority_1, recordFactory));
app_0_0.updateResourceRequests(app_0_0_requests);
// Schedule to compute
queue.assignContainers(clusterResource, node_0);
Resource expectedHeadroom = Resources.createResource(10*16*GB);
verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_0_0).setHeadroom(eq(expectedHeadroom));
// Submit second application from user_0, check headroom
SchedulerApp app_0_1 = getMockApplication(APPLICATION_ID++, user_0);
final ApplicationAttemptId appAttemptId_0_1 =
TestUtils.getMockApplicationAttemptId(1, 0);
SchedulerApp app_0_1 =
spy(new SchedulerApp(appAttemptId_0_1, user_0, queue, rmContext, null));
queue.submitApplication(app_0_1, user_0, A);
List<ResourceRequest> app_0_1_requests = new ArrayList<ResourceRequest>();
app_0_1_requests.add(
TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2,
priority_1, recordFactory));
app_0_1.updateResourceRequests(app_0_1_requests);
// Schedule to compute
queue.assignContainers(clusterResource, node_0); // Schedule to compute
verify(app_0_0, times(2)).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));// no change
verify(app_0_0, times(2)).setHeadroom(eq(expectedHeadroom));
verify(app_0_1).setHeadroom(eq(expectedHeadroom));// no change
// Submit first application from user_1, check for new headroom
SchedulerApp app_1_0 = getMockApplication(APPLICATION_ID++, user_1);
final ApplicationAttemptId appAttemptId_1_0 =
TestUtils.getMockApplicationAttemptId(2, 0);
SchedulerApp app_1_0 =
spy(new SchedulerApp(appAttemptId_1_0, user_1, queue, rmContext, null));
queue.submitApplication(app_1_0, user_1, A);
List<ResourceRequest> app_1_0_requests = new ArrayList<ResourceRequest>();
app_1_0_requests.add(
TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2,
priority_1, recordFactory));
app_1_0.updateResourceRequests(app_1_0_requests);
// Schedule to compute
queue.assignContainers(clusterResource, node_0); // Schedule to compute
expectedHeadroom = Resources.createResource(10*16*GB / 2); // changes
verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_0_0).setHeadroom(eq(expectedHeadroom));
verify(app_0_1).setHeadroom(eq(expectedHeadroom));
verify(app_1_0).setHeadroom(eq(expectedHeadroom));
// Now reduce cluster size and check for the smaller headroom
clusterResource = Resources.createResource(90*16*GB);
queue.assignContainers(clusterResource, node_0); // Schedule to compute
expectedHeadroom = Resources.createResource(9*16*GB / 2); // changes
verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
verify(app_0_0).setHeadroom(eq(expectedHeadroom));
verify(app_0_1).setHeadroom(eq(expectedHeadroom));
verify(app_1_0).setHeadroom(eq(expectedHeadroom));
}

View File

@ -255,7 +255,7 @@ public class TestLeafQueue {
// Manipulate queue 'a'
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
//unset maxCapacity
a.setMaxCapacity(-0.01f);
a.setMaxCapacity(1.0f);
// Users
final String user_0 = "user_0";
@ -377,7 +377,7 @@ public class TestLeafQueue {
// Mock the queue
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
//unset maxCapacity
a.setMaxCapacity(-0.01f);
a.setMaxCapacity(1.0f);
// Users
final String user_0 = "user_0";
@ -491,7 +491,7 @@ public class TestLeafQueue {
// Revert max-capacity and user-limit-factor
// Now, allocations should goto app_3 since it's under user-limit
a.setMaxCapacity(-0.01f);
a.setMaxCapacity(1.0f);
a.setUserLimitFactor(1);
a.assignContainers(clusterResource, node_0);
assertEquals(7*GB, a.getUsedResources().getMemory());
@ -548,7 +548,7 @@ public class TestLeafQueue {
// Manipulate queue 'a'
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
//unset maxCapacity
a.setMaxCapacity(-0.01f);
a.setMaxCapacity(1.0f);
// Users
final String user_0 = "user_0";
@ -571,7 +571,7 @@ public class TestLeafQueue {
String host_0 = "host_0";
SchedulerNode node_0 = TestUtils.getMockNode(host_0, DEFAULT_RACK, 0, 4*GB);
final int numNodes = 1;
final int numNodes = 2;
Resource clusterResource = Resources.createResource(numNodes * (4*GB));
when(csContext.getNumClusterNodes()).thenReturn(numNodes);
@ -646,7 +646,7 @@ public class TestLeafQueue {
// Manipulate queue 'a'
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
//unset maxCapacity
a.setMaxCapacity(-0.01f);
a.setMaxCapacity(1.0f);
a.setUserLimitFactor(10);
// Users
@ -673,7 +673,7 @@ public class TestLeafQueue {
String host_1 = "host_1";
SchedulerNode node_1 = TestUtils.getMockNode(host_1, DEFAULT_RACK, 0, 4*GB);
final int numNodes = 2;
final int numNodes = 3;
Resource clusterResource = Resources.createResource(numNodes * (4*GB));
when(csContext.getNumClusterNodes()).thenReturn(numNodes);
when(csContext.getMaximumResourceCapability()).thenReturn(

View File

@ -138,12 +138,34 @@ public class TestParentQueue {
when(queue).assignContainers(eq(clusterResource), eq(node));
}
private float computeQueueUsedCapacity(CSQueue queue,
int expectedMemory, Resource clusterResource) {
return (
((float)expectedMemory / clusterResource.getMemory()) *
queue.getParent().getAbsoluteCapacity()
);
}
private float computeQueueUtilization(CSQueue queue,
int expectedMemory, Resource clusterResource) {
return (expectedMemory /
(clusterResource.getMemory() * queue.getAbsoluteCapacity()));
}
final static float DELTA = 0.0001f;
private void verifyQueueMetrics(CSQueue queue,
int expectedMemory, Resource clusterResource) {
assertEquals(
computeQueueUtilization(queue, expectedMemory, clusterResource),
queue.getUtilization(),
DELTA);
assertEquals(
computeQueueUsedCapacity(queue, expectedMemory, clusterResource),
queue.getUsedCapacity(),
DELTA);
}
@Test
public void testSingleLevelQueues() throws Exception {
// Setup queue configs
@ -173,15 +195,13 @@ public class TestParentQueue {
// Start testing
LeafQueue a = (LeafQueue)queues.get(A);
LeafQueue b = (LeafQueue)queues.get(B);
final float delta = 0.0001f;
// Simulate B returning a container on node_0
stubQueueAllocation(a, clusterResource, node_0, 0*GB);
stubQueueAllocation(b, clusterResource, node_0, 1*GB);
root.assignContainers(clusterResource, node_0);
assertEquals(0.0f, a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 1*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 0*GB, clusterResource);
verifyQueueMetrics(b, 1*GB, clusterResource);
// Now, A should get the scheduling opportunity since A=0G/6G, B=1G/14G
stubQueueAllocation(a, clusterResource, node_1, 2*GB);
@ -192,10 +212,8 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(b).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 2*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 2*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 2*GB, clusterResource);
verifyQueueMetrics(b, 2*GB, clusterResource);
// Now, B should get the scheduling opportunity
// since A has 2/6G while B has 2/14G
@ -207,10 +225,8 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(a).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 3*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 4*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 3*GB, clusterResource);
verifyQueueMetrics(b, 4*GB, clusterResource);
// Now, B should still get the scheduling opportunity
// since A has 3/6G while B has 4/14G
@ -222,10 +238,8 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(a).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 3*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 8*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 3*GB, clusterResource);
verifyQueueMetrics(b, 8*GB, clusterResource);
// Now, A should get the scheduling opportunity
// since A has 3/6G while B has 8/14G
@ -237,10 +251,8 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(a).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 4*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 9*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 4*GB, clusterResource);
verifyQueueMetrics(b, 9*GB, clusterResource);
}
private static final String C = "c";
@ -323,22 +335,16 @@ public class TestParentQueue {
CSQueue b2 = queues.get(B2);
CSQueue b3 = queues.get(B3);
final float delta = 0.0001f;
// Simulate C returning a container on node_0
stubQueueAllocation(a, clusterResource, node_0, 0*GB);
stubQueueAllocation(b, clusterResource, node_0, 0*GB);
stubQueueAllocation(c, clusterResource, node_0, 1*GB);
stubQueueAllocation(d, clusterResource, node_0, 0*GB);
root.assignContainers(clusterResource, node_0);
assertEquals(computeQueueUtilization(a, 0*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 0*GB, clusterResource),
b.getUtilization(), delta);
assertEquals(computeQueueUtilization(c, 1*GB, clusterResource),
c.getUtilization(), delta);
assertEquals(computeQueueUtilization(d, 0*GB, clusterResource),
d.getUtilization(), delta);
verifyQueueMetrics(a, 0*GB, clusterResource);
verifyQueueMetrics(b, 0*GB, clusterResource);
verifyQueueMetrics(c, 1*GB, clusterResource);
verifyQueueMetrics(d, 0*GB, clusterResource);
reset(a); reset(b); reset(c);
// Now get B2 to allocate
@ -347,12 +353,9 @@ public class TestParentQueue {
stubQueueAllocation(b2, clusterResource, node_1, 4*GB);
stubQueueAllocation(c, clusterResource, node_1, 0*GB);
root.assignContainers(clusterResource, node_1);
assertEquals(computeQueueUtilization(a, 0*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 4*GB, clusterResource),
b.getUtilization(), delta);
assertEquals(computeQueueUtilization(c, 1*GB, clusterResource),
c.getUtilization(), delta);
verifyQueueMetrics(a, 0*GB, clusterResource);
verifyQueueMetrics(b, 4*GB, clusterResource);
verifyQueueMetrics(c, 1*GB, clusterResource);
reset(a); reset(b); reset(c);
// Now get both A1, C & B3 to allocate in right order
@ -368,12 +371,9 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(b).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 1*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 6*GB, clusterResource),
b.getUtilization(), delta);
assertEquals(computeQueueUtilization(c, 3*GB, clusterResource),
c.getUtilization(), delta);
verifyQueueMetrics(a, 1*GB, clusterResource);
verifyQueueMetrics(b, 6*GB, clusterResource);
verifyQueueMetrics(c, 3*GB, clusterResource);
reset(a); reset(b); reset(c);
// Now verify max-capacity
@ -399,14 +399,10 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(c).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 3*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 8*GB, clusterResource),
b.getUtilization(), delta);
assertEquals(computeQueueUtilization(c, 4*GB, clusterResource),
c.getUtilization(), delta);
verifyQueueMetrics(a, 3*GB, clusterResource);
verifyQueueMetrics(b, 8*GB, clusterResource);
verifyQueueMetrics(c, 4*GB, clusterResource);
reset(a); reset(b); reset(c);
}
@Test
@ -438,15 +434,13 @@ public class TestParentQueue {
// Start testing
LeafQueue a = (LeafQueue)queues.get(A);
LeafQueue b = (LeafQueue)queues.get(B);
final float delta = 0.0001f;
// Simulate B returning a container on node_0
stubQueueAllocation(a, clusterResource, node_0, 0*GB, NodeType.OFF_SWITCH);
stubQueueAllocation(b, clusterResource, node_0, 1*GB, NodeType.OFF_SWITCH);
root.assignContainers(clusterResource, node_0);
assertEquals(0.0f, a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 1*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 0*GB, clusterResource);
verifyQueueMetrics(b, 1*GB, clusterResource);
// Now, A should get the scheduling opportunity since A=0G/6G, B=1G/14G
// also, B gets a scheduling opportunity since A allocates RACK_LOCAL
@ -458,10 +452,8 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(b).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 2*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 2*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 2*GB, clusterResource);
verifyQueueMetrics(b, 2*GB, clusterResource);
// Now, B should get the scheduling opportunity
// since A has 2/6G while B has 2/14G,
@ -474,10 +466,8 @@ public class TestParentQueue {
any(SchedulerNode.class));
allocationOrder.verify(a).assignContainers(eq(clusterResource),
any(SchedulerNode.class));
assertEquals(computeQueueUtilization(a, 2*GB, clusterResource),
a.getUtilization(), delta);
assertEquals(computeQueueUtilization(b, 4*GB, clusterResource),
b.getUtilization(), delta);
verifyQueueMetrics(a, 2*GB, clusterResource);
verifyQueueMetrics(b, 4*GB, clusterResource);
}

View File

@ -30,6 +30,8 @@ public class TestQueueParsing {
private static final Log LOG = LogFactory.getLog(TestQueueParsing.class);
private static final double DELTA = 0.000001;
@Test
public void testQueueParsing() throws Exception {
CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
@ -37,6 +39,20 @@ public class TestQueueParsing {
CapacityScheduler capacityScheduler = new CapacityScheduler();
capacityScheduler.reinitialize(conf, null, null);
CSQueue a = capacityScheduler.getQueue("a");
Assert.assertEquals(0.10, a.getAbsoluteCapacity(), DELTA);
Assert.assertEquals(0.15, a.getAbsoluteMaximumCapacity(), DELTA);
CSQueue b1 = capacityScheduler.getQueue("b1");
Assert.assertEquals(0.2 * 0.5, b1.getAbsoluteCapacity(), DELTA);
Assert.assertEquals("Parent B has no MAX_CAP",
0.85, b1.getAbsoluteMaximumCapacity(), DELTA);
CSQueue c12 = capacityScheduler.getQueue("c12");
Assert.assertEquals(0.7 * 0.5 * 0.45, c12.getAbsoluteCapacity(), DELTA);
Assert.assertEquals(0.7 * 0.55 * 0.7,
c12.getAbsoluteMaximumCapacity(), DELTA);
}
private void setupQueueConfiguration(CapacitySchedulerConfiguration conf) {
@ -47,12 +63,14 @@ public class TestQueueParsing {
final String A = CapacitySchedulerConfiguration.ROOT + ".a";
conf.setCapacity(A, 10);
conf.setMaximumCapacity(A, 15);
final String B = CapacitySchedulerConfiguration.ROOT + ".b";
conf.setCapacity(B, 20);
final String C = CapacitySchedulerConfiguration.ROOT + ".c";
conf.setCapacity(C, 70);
conf.setMaximumCapacity(C, 70);
LOG.info("Setup top-level queues");
@ -61,15 +79,20 @@ public class TestQueueParsing {
final String A2 = A + ".a2";
conf.setQueues(A, new String[] {"a1", "a2"});
conf.setCapacity(A1, 30);
conf.setMaximumCapacity(A1, 45);
conf.setCapacity(A2, 70);
conf.setMaximumCapacity(A2, 85);
final String B1 = B + ".b1";
final String B2 = B + ".b2";
final String B3 = B + ".b3";
conf.setQueues(B, new String[] {"b1", "b2", "b3"});
conf.setCapacity(B1, 50);
conf.setMaximumCapacity(B1, 85);
conf.setCapacity(B2, 30);
conf.setMaximumCapacity(B2, 35);
conf.setCapacity(B3, 20);
conf.setMaximumCapacity(B3, 35);
final String C1 = C + ".c1";
final String C2 = C + ".c2";
@ -77,9 +100,13 @@ public class TestQueueParsing {
final String C4 = C + ".c4";
conf.setQueues(C, new String[] {"c1", "c2", "c3", "c4"});
conf.setCapacity(C1, 50);
conf.setMaximumCapacity(C1, 55);
conf.setCapacity(C2, 10);
conf.setMaximumCapacity(C2, 25);
conf.setCapacity(C3, 35);
conf.setMaximumCapacity(C3, 38);
conf.setCapacity(C4, 5);
conf.setMaximumCapacity(C4, 5);
LOG.info("Setup 2nd-level queues");
@ -89,8 +116,11 @@ public class TestQueueParsing {
final String C13 = C1 + ".c13";
conf.setQueues(C1, new String[] {"c11", "c12", "c13"});
conf.setCapacity(C11, 15);
conf.setMaximumCapacity(C11, 30);
conf.setCapacity(C12, 45);
conf.setMaximumCapacity(C12, 70);
conf.setCapacity(C13, 40);
conf.setMaximumCapacity(C13, 40);
LOG.info("Setup 3rd-level queues");
}

View File

@ -235,12 +235,13 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
Element qElem = (Element) queues.item(j);
String qName = WebServicesTestUtils.getXmlString(qElem, "queueName");
String q = CapacitySchedulerConfiguration.ROOT + "." + qName;
verifySubQueueXML(qElem, q, 100);
verifySubQueueXML(qElem, q, 100, 100);
}
}
}
public void verifySubQueueXML(Element qElem, String q, float parentAbsCapacity)
public void verifySubQueueXML(Element qElem, String q,
float parentAbsCapacity, float parentAbsMaxCapacity)
throws Exception {
NodeList queues = qElem.getElementsByTagName("subQueues");
QueueInfo qi = (queues != null) ? new QueueInfo() : new LeafQueueInfo();
@ -258,14 +259,15 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
WebServicesTestUtils.getXmlString(qElem, "usedResources");
qi.queueName = WebServicesTestUtils.getXmlString(qElem, "queueName");
qi.state = WebServicesTestUtils.getXmlString(qElem, "state");
verifySubQueueGeneric(q, qi, parentAbsCapacity);
verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
if (queues != null) {
for (int j = 0; j < queues.getLength(); j++) {
Element subqElem = (Element) queues.item(j);
String qName = WebServicesTestUtils.getXmlString(subqElem, "queueName");
String q2 = q + "." + qName;
verifySubQueueXML(subqElem, q2, qi.absoluteCapacity);
verifySubQueueXML(subqElem, q2,
qi.absoluteCapacity, qi.absoluteMaxCapacity);
}
} else {
LeafQueueInfo lqi = (LeafQueueInfo) qi;
@ -309,7 +311,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
for (int i = 0; i < arr.length(); i++) {
JSONObject obj = arr.getJSONObject(i);
String q = CapacitySchedulerConfiguration.ROOT + "." + obj.getString("queueName");
verifySubQueue(obj, q, 100);
verifySubQueue(obj, q, 100, 100);
}
}
@ -323,7 +325,8 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
assertTrue("queueName doesn't match", "root".matches(queueName));
}
private void verifySubQueue(JSONObject info, String q, float parentAbsCapacity)
private void verifySubQueue(JSONObject info, String q,
float parentAbsCapacity, float parentAbsMaxCapacity)
throws JSONException, Exception {
int numExpectedElements = 11;
boolean isParentQueue = true;
@ -345,7 +348,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
qi.queueName = info.getString("queueName");
qi.state = info.getString("state");
verifySubQueueGeneric(q, qi, parentAbsCapacity);
verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
if (isParentQueue) {
JSONArray arr = info.getJSONArray("subQueues");
@ -353,7 +356,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
for (int i = 0; i < arr.length(); i++) {
JSONObject obj = arr.getJSONObject(i);
String q2 = q + "." + obj.getString("queueName");
verifySubQueue(obj, q2, qi.absoluteCapacity);
verifySubQueue(obj, q2, qi.absoluteCapacity, qi.absoluteMaxCapacity);
}
} else {
LeafQueueInfo lqi = (LeafQueueInfo) qi;
@ -371,7 +374,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
}
private void verifySubQueueGeneric(String q, QueueInfo info,
float parentAbsCapacity) throws Exception {
float parentAbsCapacity, float parentAbsMaxCapacity) throws Exception {
String[] qArr = q.split("\\.");
assertTrue("q name invalid: " + q, qArr.length > 1);
String qshortName = qArr[qArr.length - 1];
@ -380,7 +383,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
assertEquals("capacity doesn't match", csConf.getCapacity(q),
info.capacity, 1e-3f);
float expectCapacity = csConf.getMaximumCapacity(q);
float expectAbsMaxCapacity = parentAbsCapacity * (info.maxCapacity/100);
float expectAbsMaxCapacity = parentAbsMaxCapacity * (info.maxCapacity/100);
if (CapacitySchedulerConfiguration.UNDEFINED == expectCapacity) {
expectCapacity = 100;
expectAbsMaxCapacity = 100;

View File

@ -57,7 +57,7 @@ public class AmIpFilter implements Filter {
proxyUriBase = conf.getInitParameter(PROXY_URI_BASE);
}
private Set<String> getProxyAddresses() throws ServletException {
protected Set<String> getProxyAddresses() throws ServletException {
long now = System.currentTimeMillis();
synchronized(this) {
if(proxyAddresses == null || (lastUpdate + updateInterval) >= now) {
@ -97,12 +97,15 @@ public class AmIpFilter implements Filter {
}
String user = null;
if (httpReq.getCookies() != null) {
for(Cookie c: httpReq.getCookies()) {
if(WebAppProxyServlet.PROXY_USER_COOKIE_NAME.equals(c.getName())){
user = c.getValue();
break;
}
}
}
if(user == null) {
LOG.warn("Could not find "+WebAppProxyServlet.PROXY_USER_COOKIE_NAME
+" cookie, so user will not be set");

View File

@ -0,0 +1,121 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.webproxy.amfilter;
import java.io.IOException;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import junit.framework.Assert;
import org.junit.Test;
import org.mockito.Mockito;
public class TestAmFilter {
private String proxyHost = "bogushost.com";
private String proxyUri = "http://bogus";
private class TestAmIpFilter extends AmIpFilter {
private Set<String> proxyAddresses = null;
protected Set<String> getProxyAddresses() {
if(proxyAddresses == null) {
proxyAddresses = new HashSet<String>();
}
proxyAddresses.add(proxyHost);
return proxyAddresses;
}
}
private static class DummyFilterConfig implements FilterConfig {
final Map<String, String> map;
DummyFilterConfig(Map<String,String> map) {
this.map = map;
}
@Override
public String getFilterName() {
return "dummy";
}
@Override
public String getInitParameter(String arg0) {
return map.get(arg0);
}
@Override
public Enumeration<String> getInitParameterNames() {
return Collections.enumeration(map.keySet());
}
@Override
public ServletContext getServletContext() {
return null;
}
}
@Test
public void filterNullCookies() throws Exception {
HttpServletRequest request = Mockito.mock(HttpServletRequest.class);
Mockito.when(request.getCookies()).thenReturn(null);
Mockito.when(request.getRemoteAddr()).thenReturn(proxyHost);
HttpServletResponse response = Mockito.mock(HttpServletResponse.class);
final AtomicBoolean invoked = new AtomicBoolean();
FilterChain chain = new FilterChain() {
@Override
public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse)
throws IOException, ServletException {
invoked.set(true);
}
};
Map<String, String> params = new HashMap<String, String>();
params.put(AmIpFilter.PROXY_HOST, proxyHost);
params.put(AmIpFilter.PROXY_URI_BASE, proxyUri);
FilterConfig conf = new DummyFilterConfig(params);
Filter filter = new TestAmIpFilter();
filter.init(conf);
filter.doFilter(request, response, chain);
Assert.assertTrue(invoked.get());
filter.destroy();
}
}

View File

@ -95,7 +95,7 @@ Hadoop MapReduce Next Generation - Cluster Setup
*--------------------------------------+--------------------------------------+
| DataNode | HADOOP_DATANODE_OPTS |
*--------------------------------------+--------------------------------------+
| Backup NameNode | HADOOP_SECONDARYNAMENODE_OPTS |
| Secondary NameNode | HADOOP_SECONDARYNAMENODE_OPTS |
*--------------------------------------+--------------------------------------+
| ResourceManager | YARN_RESOURCEMANAGER_OPTS |
*--------------------------------------+--------------------------------------+
@ -537,15 +537,15 @@ Hadoop MapReduce Next Generation - Cluster Setup
It's recommended to have them share a Unix group, e.g. <<<hadoop>>>.
*--------------------------------------+--------------------------------------+
*--------------------------------------+----------------------------------------------------------------------+
|| User:Group || Daemons |
*--------------------------------------+--------------------------------------+
| hdfs:hadoop | NameNode, Backup NameNode, DataNode |
*--------------------------------------+--------------------------------------+
*--------------------------------------+----------------------------------------------------------------------+
| hdfs:hadoop | NameNode, Secondary NameNode, Checkpoint Node, Backup Node, DataNode |
*--------------------------------------+----------------------------------------------------------------------+
| yarn:hadoop | ResourceManager, NodeManager |
*--------------------------------------+--------------------------------------+
*--------------------------------------+----------------------------------------------------------------------+
| mapred:hadoop | MapReduce JobHistory Server |
*--------------------------------------+--------------------------------------+
*--------------------------------------+----------------------------------------------------------------------+
* <<<Permissions for both HDFS and local fileSystem paths>>>

View File

@ -0,0 +1,49 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
YARN
---
---
${maven.build.timestamp}
Web Application Proxy
The Web Application Proxy is part of YARN. By default it runs as part of
the Resource Manager (RM), but it can be configured to run in standalone mode.
The purpose of the proxy is to reduce the possibility of web-based attacks
through YARN.
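A minimal sketch of the standalone option follows; the property name yarn.web-proxy.address and its standalone semantics are assumptions here rather than something stated in this patch, and in a real deployment the value would normally be set in yarn-site.xml rather than in code.
// Sketch only: give the proxy its own host:port so it does not run embedded in the RM.
import org.apache.hadoop.conf.Configuration;
public class StandaloneProxyConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Assumed property name; a distinct address is what separates the proxy from the RM.
    conf.set("yarn.web-proxy.address", "proxy.example.com:9099");
    System.out.println("Proxy would bind to: " + conf.get("yarn.web-proxy.address"));
  }
}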
In YARN the Application Master (AM) is responsible for providing a web UI
and sending that link to the RM. This opens up a number of potential
issues. The RM runs as a trusted user, and people visiting that web
address will treat it, and the links it provides, as trusted, when in
reality the AM runs as a non-trusted user and the links it hands to
the RM could point to anything malicious. The Web Application
Proxy mitigates this risk by warning users who do not own the given
application that they are connecting to an untrusted site.
In addition, the proxy tries to reduce the impact that a malicious
AM could have on a user. It does this primarily by stripping the user's
cookies from the request and replacing them with a single cookie carrying the
user name of the logged-in user. Most web-based authentication systems
identify a user based on a cookie, so handing those cookies to an
untrusted application opens up the potential for an exploit. If the cookies
are designed properly that potential should be fairly minimal, but stripping them
further reduces the attack vector. The current proxy implementation does
nothing to prevent the AM from providing links to malicious external sites,
nor does it prevent malicious JavaScript code from running. In fact,
JavaScript can be used to read the cookies, so stripping the
cookies from the request has minimal benefit at this time.
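As a hedged illustration of the single-cookie mechanism, the following sketch shows how an AM-side filter could recover the proxied user name, in the spirit of the AmIpFilter change earlier in this patch; the literal cookie name "proxy-user" is an assumption, since only the WebAppProxyServlet.PROXY_USER_COOKIE_NAME constant appears in the code above.
import javax.servlet.http.Cookie;
import javax.servlet.http.HttpServletRequest;
public class ProxyUserCookieSketch {
  // Assumed cookie value; the real code refers to WebAppProxyServlet.PROXY_USER_COOKIE_NAME.
  static final String PROXY_USER_COOKIE_NAME = "proxy-user";
  // Returns the user name the proxy attached, or null when no such cookie exists
  // (the null-cookie case is what TestAmFilter#filterNullCookies exercises).
  static String getProxiedUser(HttpServletRequest httpReq) {
    if (httpReq.getCookies() == null) {
      return null;
    }
    for (Cookie c : httpReq.getCookies()) {
      if (PROXY_USER_COOKIE_NAME.equals(c.getName())) {
        return c.getValue();
      }
    }
    return null;
  }
}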
In the future we hope to address the attack vectors described above and make
attaching to an AM's web UI safer.

View File

@ -47,4 +47,6 @@ MapReduce NextGen aka YARN aka MRv2
* {{{./CapacityScheduler.html}Capacity Scheduler}}
* {{{./WebApplicationProxy.html}Web Application Proxy}}

View File

@ -223,6 +223,11 @@
<artifactId>hadoop-archives</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-rumen</artifactId>
@ -709,11 +714,21 @@
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>2.4</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.2</version>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pdf-plugin</artifactId>
<version>1.1</version>
</plugin>
</plugins>
</pluginManagement>
@ -811,6 +826,14 @@
</excludes>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pdf-plugin</artifactId>
<configuration>
<outputDirectory>${project.reporting.outputDirectory}</outputDirectory>
<includeReports>false</includeReports>
</configuration>
</plugin>
</plugins>
</build>

View File

@ -61,6 +61,7 @@
<item name="YARN Architecture" href="hadoop-yarn/hadoop-yarn-site/YARN.html"/>
<item name="Writing Yarn Applications" href="hadoop-yarn/hadoop-yarn-site/WritingYarnApplications.html"/>
<item name="Capacity Scheduler" href="hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html"/>
<item name="Web Application Proxy" href="hadoop-yarn/hadoop-yarn-site/WebApplicationProxy.html"/>
</menu>
<menu name="YARN REST API's" inherit="top">

View File

@ -0,0 +1,7 @@
DistCp (distributed copy) is a tool used for large inter/intra-cluster copying.
It uses Map/Reduce to effect its distribution, error handling and recovery,
and reporting. It expands a list of files and directories into input to map tasks,
each of which will copy a partition of the files specified in the source list.
Version 0.1 (2010/08/02 sriksun)
- Initial Version
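A hedged sketch of driving the tool programmatically, mirroring the DistCp(Configuration, DistCpOptions) constructor and execute() method introduced later in this patch; the DistCpOptions constructor taking a source-path list and a target path, and the NameNode host names, are assumptions used only for illustration.
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
public class DistCpDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Copy /tmp/data from one (hypothetical) cluster to another.
    DistCpOptions options = new DistCpOptions(
        Arrays.asList(new Path("hdfs://source-nn:8020/tmp/data")),
        new Path("hdfs://target-nn:8020/tmp/data-copy"));
    // Builds the copy listing and submits the map-only copy job.
    Job job = new DistCp(conf, options).execute();
    System.out.println("DistCp job id: " + job.getJobID());
  }
}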

View File

@ -0,0 +1,198 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<project>
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-project</artifactId>
<version>0.23.1-SNAPSHOT</version>
<relativePath>../../hadoop-project</relativePath>
</parent>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId>
<version>0.23.1-SNAPSHOT</version>
<description>Apache Hadoop Distributed Copy</description>
<name>Apache Hadoop Distributed Copy</name>
<packaging>jar</packaging>
<properties>
<file.encoding>UTF-8</file.encoding>
<downloadSources>true</downloadSources>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-annotations</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-app</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-hs</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>test</scope>
<type>test-jar</type>
</dependency>
</dependencies>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
<filtering>true</filtering>
</resource>
</resources>
<testResources>
<testResource>
<directory>src/test/resources</directory>
<filtering>true</filtering>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<forkMode>always</forkMode>
<forkedProcessTimeoutInSeconds>600</forkedProcessTimeoutInSeconds>
<argLine>-Xmx1024m</argLine>
<includes>
<include>**/Test*.java</include>
</includes>
<redirectTestOutputToFile>true</redirectTestOutputToFile>
<systemProperties>
<property>
<name>test.build.data</name>
<value>${basedir}/target/test/data</value>
</property>
<property>
<name>hadoop.log.dir</name>
<value>target/test/logs</value>
</property>
<property>
<name>org.apache.commons.logging.Log</name>
<value>org.apache.commons.logging.impl.SimpleLog</value>
</property>
<property>
<name>org.apache.commons.logging.simplelog.defaultlog</name>
<value>warn</value>
</property>
</systemProperties>
</configuration>
</plugin>
<plugin>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<configuration>
<enableRulesSummary>true</enableRulesSummary>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>org.apache.hadoop.tools.DistCp</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<configuration>
<attach>true</attach>
</configuration>
<executions>
<execution>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pdf-plugin</artifactId>
<executions>
<execution>
<id>pdf</id>
<phase>package</phase>
<goals>
<goal>pdf</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,218 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.security.Credentials;
import java.io.IOException;
/**
* The CopyListing abstraction is responsible for how the list of
* sources and targets is constructed, for DistCp's copy function.
* The copy-listing should be a SequenceFile<Text, FileStatus>,
* located at the path specified to buildListing(),
* each entry being a pair of (Source relative path, source file status),
* all the paths being fully qualified.
*/
public abstract class CopyListing extends Configured {
private Credentials credentials;
/**
* The build-listing function creates the input listing that DistCp uses to
* perform the copy.
*
* The listing is a sequence file with the relative path of a file as the key
* and the file status of the source file as the value.
*
* For instance if the source path is /tmp/data and the traversed path is
* /tmp/data/dir1/dir2/file1, then the sequence file would contain
*
* key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
*
* File would also contain directory entries. Meaning, if /tmp/data/dir1/dir2/file1
* is the only file under /tmp/data, the resulting sequence file would contain the
* following entries
*
* key: /dir1 and value: FileStatus(/tmp/data/dir1)
* key: /dir1/dir2 and value: FileStatus(/tmp/data/dir1/dir2)
* key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
*
* Cases requiring special handling:
* If source path is a file (/tmp/file1), contents of the file will be as follows
*
* TARGET DOES NOT EXIST: Key-"", Value-FileStatus(/tmp/file1)
* TARGET IS FILE : Key-"", Value-FileStatus(/tmp/file1)
* TARGET IS DIR : Key-"/file1", Value-FileStatus(/tmp/file1)
*
* @param pathToListFile - Output file where the listing would be stored
* @param options - Input options to distcp
* @throws IOException - Exception if any
*/
public final void buildListing(Path pathToListFile,
DistCpOptions options) throws IOException {
validatePaths(options);
doBuildListing(pathToListFile, options);
Configuration config = getConf();
config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, pathToListFile.toString());
config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, getBytesToCopy());
config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths());
checkForDuplicates(pathToListFile);
}
/**
* Validate input and output paths
*
* @param options - Input options
* @throws InvalidInputException: If inputs are invalid
* @throws IOException: any Exception with FS
*/
protected abstract void validatePaths(DistCpOptions options)
throws IOException, InvalidInputException;
/**
* The interface to be implemented by sub-classes, to create the source/target file listing.
* @param pathToListFile Path on HDFS where the listing file is written.
* @param options Input Options for DistCp (indicating source/target paths.)
* @throws IOException: Thrown on failure to create the listing file.
*/
protected abstract void doBuildListing(Path pathToListFile,
DistCpOptions options) throws IOException;
/**
* Return the total number of bytes that DistCp should copy for the source paths.
* This doesn't consider whether a file is identical at the target and should be skipped during the copy.
*
* @return total bytes to copy
*/
protected abstract long getBytesToCopy();
/**
* Return the total number of paths to DistCp, including directories.
* This doesn't consider whether a file/dir is already present at the target and should be skipped during the copy.
*
* @return Total number of paths to distcp
*/
protected abstract long getNumberOfPaths();
/**
* Validate the final resulting path listing to see if there are any duplicate entries
*
* @param pathToListFile - path listing built by doBuildListing
* @throws IOException - on any issue while checking for duplicates
* @throws DuplicateFileException - if there are duplicates
*/
private void checkForDuplicates(Path pathToListFile)
throws DuplicateFileException, IOException {
Configuration config = getConf();
FileSystem fs = pathToListFile.getFileSystem(config);
Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile);
SequenceFile.Reader reader = new SequenceFile.Reader(
config, SequenceFile.Reader.file(sortedList));
try {
Text lastKey = new Text("*"); //source relative path can never hold *
FileStatus lastFileStatus = new FileStatus();
Text currentKey = new Text();
while (reader.next(currentKey)) {
if (currentKey.equals(lastKey)) {
FileStatus currentFileStatus = new FileStatus();
reader.getCurrentValue(currentFileStatus);
throw new DuplicateFileException("File " + lastFileStatus.getPath() + " and " +
currentFileStatus.getPath() + " would cause duplicates. Aborting");
}
reader.getCurrentValue(lastFileStatus);
lastKey.set(currentKey);
}
} finally {
IOUtils.closeStream(reader);
}
}
/**
* Protected constructor, to initialize configuration.
* @param configuration The input configuration,
* with which the source/target FileSystems may be accessed.
* @param credentials - Credentials object on which the FS delegation tokens are cached. If null,
* delegation-token caching is skipped.
*/
protected CopyListing(Configuration configuration, Credentials credentials) {
setConf(configuration);
setCredentials(credentials);
}
/**
* Set the Credentials store on which FS delegation tokens will be cached.
* @param credentials - Credentials object
*/
protected void setCredentials(Credentials credentials) {
this.credentials = credentials;
}
/**
* get credentials to update the delegation tokens for accessed FS objects
* @return Credentials object
*/
protected Credentials getCredentials() {
return credentials;
}
/**
* Public Factory method with which the appropriate CopyListing implementation may be retrieved.
* @param configuration The input configuration.
* @param credentials Credentials object on which the FS delegation tokens are cached
* @param options The input Options, to help choose the appropriate CopyListing Implementation.
* @return An instance of the appropriate CopyListing implementation.
*/
public static CopyListing getCopyListing(Configuration configuration,
Credentials credentials,
DistCpOptions options) {
if (options.getSourceFileListing() == null) {
return new GlobbedCopyListing(configuration, credentials);
} else {
return new FileBasedCopyListing(configuration, credentials);
}
}
static class DuplicateFileException extends RuntimeException {
public DuplicateFileException(String message) {
super(message);
}
}
static class InvalidInputException extends RuntimeException {
public InvalidInputException(String message) {
super(message);
}
}
}

View File

@ -0,0 +1,405 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.tools.CopyListing.*;
import org.apache.hadoop.tools.mapred.CopyMapper;
import org.apache.hadoop.tools.mapred.CopyOutputFormat;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.util.Random;
/**
* DistCp is the main driver-class for DistCpV2.
* For command-line use, DistCp::main() orchestrates the parsing of command-line
* parameters and the launch of the DistCp job.
* For programmatic use, a DistCp object can be constructed by specifying
* options (in a DistCpOptions object), and DistCp::execute() may be used to
* launch the copy-job. DistCp may alternatively be sub-classed to fine-tune
* behaviour.
*/
public class DistCp extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(DistCp.class);
private DistCpOptions inputOptions;
private Path metaFolder;
private static final String PREFIX = "_distcp";
private static final String WIP_PREFIX = "._WIP_";
private static final String DISTCP_DEFAULT_XML = "distcp-default.xml";
public static final Random rand = new Random();
private boolean submitted;
private FileSystem jobFS;
/**
* Public Constructor. Creates DistCp object with specified input-parameters.
* (E.g. source-paths, target-location, etc.)
* @param inputOptions Options (indicating source-paths, target-location.)
* @param configuration The Hadoop configuration against which the Copy-mapper must run.
* @throws Exception on failure.
*/
public DistCp(Configuration configuration, DistCpOptions inputOptions) throws Exception {
Configuration config = new Configuration(configuration);
config.addResource(DISTCP_DEFAULT_XML);
setConf(config);
this.inputOptions = inputOptions;
this.metaFolder = createMetaFolderPath();
}
/**
* To be used with the ToolRunner. Not for public consumption.
*/
private DistCp() {}
/**
* Implementation of Tool::run(). Orchestrates the copy of source file(s)
* to target location, by:
* 1. Creating a list of files to be copied to target.
* 2. Launching a Map-only job to copy the files. (Delegates to execute().)
* @param argv List of arguments passed to DistCp, from the ToolRunner.
* @return On success, it returns 0. Else, -1.
*/
public int run(String[] argv) {
try {
inputOptions = (OptionsParser.parse(argv));
LOG.info("Input Options: " + inputOptions);
} catch (Throwable e) {
LOG.error("Invalid arguments: ", e);
System.err.println("Invalid arguments: " + e.getMessage());
OptionsParser.usage();
return DistCpConstants.INVALID_ARGUMENT;
}
try {
execute();
} catch (InvalidInputException e) {
LOG.error("Invalid input: ", e);
return DistCpConstants.INVALID_ARGUMENT;
} catch (DuplicateFileException e) {
LOG.error("Duplicate files in input path: ", e);
return DistCpConstants.DUPLICATE_INPUT;
} catch (Exception e) {
LOG.error("Exception encountered ", e);
return DistCpConstants.UNKNOWN_ERROR;
}
return DistCpConstants.SUCCESS;
}
/**
* Implements the core execution. Creates the file listing for the copy
* and launches the Hadoop job to do the copy.
* @return Job handle
* @throws Exception on failure.
*/
public Job execute() throws Exception {
assert inputOptions != null;
assert getConf() != null;
Job job = null;
try {
metaFolder = createMetaFolderPath();
jobFS = metaFolder.getFileSystem(getConf());
job = createJob();
createInputFileListing(job);
job.submit();
submitted = true;
} finally {
if (!submitted) {
cleanup();
}
}
String jobID = job.getJobID().toString();
job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);
LOG.info("DistCp job-id: " + jobID);
if (inputOptions.shouldBlock()) {
job.waitForCompletion(true);
}
return job;
}
/**
* Create Job object for submitting it, with all the configuration
*
* @return Reference to job object.
* @throws IOException - Exception if any
*/
private Job createJob() throws IOException {
String jobName = "distcp";
String userChosenName = getConf().get(JobContext.JOB_NAME);
if (userChosenName != null)
jobName += ": " + userChosenName;
Job job = Job.getInstance(getConf());
job.setJobName(jobName);
job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
job.setJarByClass(CopyMapper.class);
configureOutputFormat(job);
job.setMapperClass(CopyMapper.class);
job.setNumReduceTasks(0);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputFormatClass(CopyOutputFormat.class);
job.getConfiguration().set(JobContext.MAP_SPECULATIVE, "false");
job.getConfiguration().set(JobContext.NUM_MAPS,
String.valueOf(inputOptions.getMaxMaps()));
if (inputOptions.getSslConfigurationFile() != null) {
setupSSLConfig(job);
}
inputOptions.appendToConf(job.getConfiguration());
return job;
}
/**
* Set up the SSL configuration on the job configuration to enable hsftp access
* from the map tasks. Also copy the SSL configuration file to the distributed cache.
*
* @param job - Reference to job's handle
* @throws java.io.IOException - Exception if unable to locate ssl config file
*/
private void setupSSLConfig(Job job) throws IOException {
Configuration configuration = job.getConfiguration();
Path sslConfigPath = new Path(configuration.
getResource(inputOptions.getSslConfigurationFile()).toString());
addSSLFilesToDistCache(job, sslConfigPath);
configuration.set(DistCpConstants.CONF_LABEL_SSL_CONF, sslConfigPath.getName());
configuration.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfigPath.getName());
}
/**
* Add the SSL files (trust store, key store and the SSL config XML) to the distributed cache.
*
* @param job - Job handle
* @param sslConfigPath - ssl Configuration file specified through options
* @throws IOException - If any
*/
private void addSSLFilesToDistCache(Job job,
Path sslConfigPath) throws IOException {
Configuration configuration = job.getConfiguration();
FileSystem localFS = FileSystem.getLocal(configuration);
Configuration sslConf = new Configuration(false);
sslConf.addResource(sslConfigPath);
Path localStorePath = getLocalStorePath(sslConf,
DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION);
job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
localFS.getWorkingDirectory()).toUri());
configuration.set(DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION,
localStorePath.getName());
localStorePath = getLocalStorePath(sslConf,
DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION);
job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
localFS.getWorkingDirectory()).toUri());
configuration.set(DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION,
localStorePath.getName());
job.addCacheFile(sslConfigPath.makeQualified(localFS.getUri(),
localFS.getWorkingDirectory()).toUri());
}
/**
* Get Local Trust store/key store path
*
* @param sslConf - Config from SSL Client xml
* @param storeKey - Key for either trust store or key store
* @return - Path where the store is present
* @throws IOException -If any
*/
private Path getLocalStorePath(Configuration sslConf, String storeKey) throws IOException {
if (sslConf.get(storeKey) != null) {
return new Path(sslConf.get(storeKey));
} else {
throw new IOException("Store for " + storeKey + " is not set in " +
inputOptions.getSslConfigurationFile());
}
}
/**
* Setup output format appropriately
*
* @param job - Job handle
* @throws IOException - Exception if any
*/
private void configureOutputFormat(Job job) throws IOException {
final Configuration configuration = job.getConfiguration();
Path targetPath = inputOptions.getTargetPath();
FileSystem targetFS = targetPath.getFileSystem(configuration);
targetPath = targetPath.makeQualified(targetFS.getUri(),
targetFS.getWorkingDirectory());
if (inputOptions.shouldAtomicCommit()) {
Path workDir = inputOptions.getAtomicWorkPath();
if (workDir == null) {
workDir = targetPath.getParent();
}
workDir = new Path(workDir, WIP_PREFIX + targetPath.getName()
+ rand.nextInt());
FileSystem workFS = workDir.getFileSystem(configuration);
if (!DistCpUtils.compareFs(targetFS, workFS)) {
throw new IllegalArgumentException("Work path " + workDir +
" and target path " + targetPath + " are in different file system");
}
CopyOutputFormat.setWorkingDirectory(job, workDir);
} else {
CopyOutputFormat.setWorkingDirectory(job, targetPath);
}
CopyOutputFormat.setCommitDirectory(job, targetPath);
Path logPath = inputOptions.getLogPath();
if (logPath == null) {
logPath = new Path(metaFolder, "_logs");
} else {
LOG.info("DistCp job log path: " + logPath);
}
CopyOutputFormat.setOutputPath(job, logPath);
}
/**
* Create input listing by invoking an appropriate copy listing
* implementation. Also add delegation tokens for each path
* to the job's credential store
*
* @param job - Handle to job
* @return Returns the path where the copy listing is created
* @throws IOException - If any
*/
private Path createInputFileListing(Job job) throws IOException {
Path fileListingPath = getFileListingPath();
CopyListing copyListing = CopyListing.getCopyListing(job.getConfiguration(),
job.getCredentials(), inputOptions);
copyListing.buildListing(fileListingPath, inputOptions);
return fileListingPath;
}
/**
* Get default name of the copy listing file. Use the meta folder
* to create the copy listing file
*
* @return - Path where the copy listing file has to be saved
* @throws IOException - Exception if any
*/
private Path getFileListingPath() throws IOException {
String fileListPathStr = metaFolder + "/fileList.seq";
Path path = new Path(fileListPathStr);
return new Path(path.toUri().normalize().toString());
}
/**
* Create a default working folder for the job, under the
* job staging directory
*
* @return Returns the working folder information
* @throws Exception - Exception if any
*/
private Path createMetaFolderPath() throws Exception {
Configuration configuration = getConf();
Path stagingDir = JobSubmissionFiles.getStagingDir(
new Cluster(configuration), configuration);
Path metaFolderPath = new Path(stagingDir, PREFIX + String.valueOf(rand.nextInt()));
if (LOG.isDebugEnabled())
LOG.debug("Meta folder location: " + metaFolderPath);
configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString());
return metaFolderPath;
}
/**
* Main function of the DistCp program. Parses the input arguments (via OptionsParser),
* and invokes the DistCp::run() method, via the ToolRunner.
* @param argv Command-line arguments sent to DistCp.
*/
public static void main(String argv[]) {
try {
DistCp distCp = new DistCp();
Cleanup CLEANUP = new Cleanup(distCp);
Runtime.getRuntime().addShutdownHook(CLEANUP);
System.exit(ToolRunner.run(getDefaultConf(), distCp, argv));
}
catch (Exception e) {
LOG.error("Couldn't complete DistCp operation: ", e);
System.exit(DistCpConstants.UNKNOWN_ERROR);
}
}
/**
* Loads properties from distcp-default.xml into configuration
* object
* @return Configuration which includes properties from distcp-default.xml
*/
private static Configuration getDefaultConf() {
Configuration config = new Configuration();
config.addResource(DISTCP_DEFAULT_XML);
return config;
}
private synchronized void cleanup() {
try {
if (metaFolder == null) return;
jobFS.delete(metaFolder, true);
metaFolder = null;
} catch (IOException e) {
LOG.error("Unable to cleanup meta folder: " + metaFolder, e);
}
}
private boolean isSubmitted() {
return submitted;
}
private static class Cleanup extends Thread {
private final DistCp distCp;
public Cleanup(DistCp distCp) {
this.distCp = distCp;
}
@Override
public void run() {
if (distCp.isSubmitted()) return;
distCp.cleanup();
}
}
}
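The driver above is normally launched from the shell, but the same entry points (ToolRunner.run() and the DistCpConstants return codes) can also be exercised from another Java program. The following is a minimal illustrative sketch, not part of this patch; the cluster URIs are placeholders, and it assumes the no-argument DistCp constructor used by main() above is accessible to the caller.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.util.ToolRunner;

public class DistCpDriverSketch {
  public static void main(String[] args) throws Exception {
    // Same flow as DistCp.main(): hand the tool and its arguments to ToolRunner.
    String[] distCpArgs = {
        "-update",                           // copy only files missing on the target
        "hdfs://src-nn:8020/data/logs",      // source (placeholder)
        "hdfs://dst-nn:8020/backup/logs"     // target (placeholder)
    };
    // Assumes the no-arg DistCp constructor (used by main() above) is accessible here.
    int rc = ToolRunner.run(new Configuration(), new DistCp(), distCpArgs);
    if (rc != DistCpConstants.SUCCESS) {
      System.err.println("DistCp failed with return code " + rc);
    }
  }
}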

View File

@@ -0,0 +1,104 @@
package org.apache.hadoop.tools;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Utility class to hold commonly used constants.
*/
public class DistCpConstants {
/* Default number of maps to use for DistCp */
public static final int DEFAULT_MAPS = 20;
/* Default bandwidth if none specified */
public static final int DEFAULT_BANDWIDTH_MB = 100;
/* Default strategy for copying. Implementation looked up
from distcp-default.xml
*/
public static final String UNIFORMSIZE = "uniformsize";
/**
* Constants mapping to command line switches/input options
*/
public static final String CONF_LABEL_ATOMIC_COPY = "distcp.atomic.copy";
public static final String CONF_LABEL_WORK_PATH = "distcp.work.path";
public static final String CONF_LABEL_LOG_PATH = "distcp.log.path";
public static final String CONF_LABEL_IGNORE_FAILURES = "distcp.ignore.failures";
public static final String CONF_LABEL_PRESERVE_STATUS = "distcp.preserve.status";
public static final String CONF_LABEL_SYNC_FOLDERS = "distcp.sync.folders";
public static final String CONF_LABEL_DELETE_MISSING = "distcp.delete.missing.source";
public static final String CONF_LABEL_SSL_CONF = "distcp.keystore.resource";
public static final String CONF_LABEL_MAX_MAPS = "distcp.max.maps";
public static final String CONF_LABEL_SOURCE_LISTING = "distcp.source.listing";
public static final String CONF_LABEL_COPY_STRATEGY = "distcp.copy.strategy";
public static final String CONF_LABEL_SKIP_CRC = "distcp.skip.crc";
public static final String CONF_LABEL_OVERWRITE = "distcp.copy.overwrite";
public static final String CONF_LABEL_BANDWIDTH_MB = "distcp.map.bandwidth.mb";
/* Total bytes to be copied. Updated by copylisting. Unfiltered count */
public static final String CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED = "mapred.total.bytes.expected";
/* Total number of paths to copy, includes directories. Unfiltered count */
public static final String CONF_LABEL_TOTAL_NUMBER_OF_RECORDS = "mapred.number.of.records";
/* SSL keystore resource */
public static final String CONF_LABEL_SSL_KEYSTORE = "dfs.https.client.keystore.resource";
/* If the input is specified via -f <source listing>, the file containing the source paths */
public static final String CONF_LABEL_LISTING_FILE_PATH = "distcp.listing.file.path";
/* Directory where the mapreduce job will write to. If not atomic commit, then same
as CONF_LABEL_TARGET_FINAL_PATH
*/
public static final String CONF_LABEL_TARGET_WORK_PATH = "distcp.target.work.path";
/* Directory where the final data will be committed to. If not atomic commit, then same
as CONF_LABEL_TARGET_WORK_PATH
*/
public static final String CONF_LABEL_TARGET_FINAL_PATH = "distcp.target.final.path";
/**
* DistCp job id for consumers of DistCp
*/
public static final String CONF_LABEL_DISTCP_JOB_ID = "distcp.job.id";
/* Meta folder where the job's intermediate data is kept */
public static final String CONF_LABEL_META_FOLDER = "distcp.meta.folder";
/**
* Conf label for SSL Trust-store location.
*/
public static final String CONF_LABEL_SSL_TRUST_STORE_LOCATION
= "ssl.client.truststore.location";
/**
* Conf label for SSL Key-store location.
*/
public static final String CONF_LABEL_SSL_KEY_STORE_LOCATION
= "ssl.client.keystore.location";
/**
* Constants for DistCp return code to shell / consumer of ToolRunner's run
*/
public static final int SUCCESS = 0;
public static final int INVALID_ARGUMENT = -1;
public static final int DUPLICATE_INPUT = -2;
public static final int UNKNOWN_ERROR = -999;
}
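To show how these labels are consumed downstream (an illustrative sketch, not part of this patch): the mapper/committer side reads them straight off the job Configuration, just as the CopyCommitter further below does for the atomic-copy and delete-missing flags.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCpConstants;

public class DistCpConstantsSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // The driver side stamps the label into the job configuration...
    conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true);
    // ...and the committer consults the same label to decide whether to commit atomically.
    boolean atomic = conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false);
    System.out.println("atomic commit requested: " + atomic);
  }
}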

View File

@@ -0,0 +1,218 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.commons.cli.Option;
import org.apache.hadoop.conf.Configuration;
/**
* Enumeration mapping configuration keys to distcp command line
* options.
*/
public enum DistCpOptionSwitch {
/**
* Ignores any failures during copy, and continues with the rest.
* Failures are logged to a file.
*/
IGNORE_FAILURES(DistCpConstants.CONF_LABEL_IGNORE_FAILURES,
new Option("i", false, "Ignore failures during copy")),
/**
* Preserves status of file/path in the target.
* Default behavior with -p is to preserve replication,
* block size, user, group and permission on the target file.
*
* If any of the optional flags (rbugp) are specified, then
* only the corresponding file attributes are preserved.
*
*/
PRESERVE_STATUS(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
new Option("p", true, "preserve status (rbugp)" +
"(replication, block-size, user, group, permission)")),
/**
* Update target location by copying only files that are missing
* in the target. This can be used to periodically sync two folders
* across source and target. Typically used with DELETE_MISSING
* Incompatible with ATOMIC_COMMIT
*/
SYNC_FOLDERS(DistCpConstants.CONF_LABEL_SYNC_FOLDERS,
new Option("update", false, "Update target, copying only missing" +
"files or directories")),
/**
* Deletes files in the target that are missing from the source.
* This allows the target to be in sync with the source contents
* Typically used in conjunction with SYNC_FOLDERS
* Incompatible with ATOMIC_COMMIT
*/
DELETE_MISSING(DistCpConstants.CONF_LABEL_DELETE_MISSING,
new Option("delete", false, "Delete from target, " +
"files missing in source")),
/**
* Configuration file to use with hftps:// for securely copying
* files across clusters. Typically the configuration file contains
* truststore/keystore information such as location, password and type
*/
SSL_CONF(DistCpConstants.CONF_LABEL_SSL_CONF,
new Option("mapredSslConf", true, "Configuration for ssl config file" +
", to use with hftps://")),
/**
* Max number of maps to use during copy. DistCp will split work
* as equally as possible among these maps
*/
MAX_MAPS(DistCpConstants.CONF_LABEL_MAX_MAPS,
new Option("m", true, "Max number of concurrent maps to use for copy")),
/**
* Source file listing can be provided to DistCp in a file.
* This allows DistCp to copy random list of files from source
* and copy them to target
*/
SOURCE_FILE_LISTING(DistCpConstants.CONF_LABEL_SOURCE_LISTING,
new Option("f", true, "List of files that need to be copied")),
/**
* Copy all the source files and commit them atomically to the target
* This is typically useful in cases where there is a process
* polling for availability of a file/dir. This option is incompatible
* with SYNC_FOLDERS & DELETE_MISSING
*/
ATOMIC_COMMIT(DistCpConstants.CONF_LABEL_ATOMIC_COPY,
new Option("atomic", false, "Commit all changes or none")),
/**
* Work path to be used only in conjunction in Atomic commit
*/
WORK_PATH(DistCpConstants.CONF_LABEL_WORK_PATH,
new Option("tmp", true, "Intermediate work path to be used for atomic commit")),
/**
* Log path where distcp output logs are written to
*/
LOG_PATH(DistCpConstants.CONF_LABEL_LOG_PATH,
new Option("log", true, "Folder on DFS where distcp execution logs are saved")),
/**
* Copy strategy to use. This could be dynamic, uniform-size, etc.
* DistCp picks an appropriate input format based on this.
*/
COPY_STRATEGY(DistCpConstants.CONF_LABEL_COPY_STRATEGY,
new Option("strategy", true, "Copy strategy to use. Default is " +
"dividing work based on file sizes")),
/**
* Skip CRC checks between source and target, when determining what
* files need to be copied.
*/
SKIP_CRC(DistCpConstants.CONF_LABEL_SKIP_CRC,
new Option("skipcrccheck", false, "Whether to skip CRC checks between " +
"source and target paths.")),
/**
* Overwrite target-files unconditionally.
*/
OVERWRITE(DistCpConstants.CONF_LABEL_OVERWRITE,
new Option("overwrite", false, "Choose to overwrite target files " +
"unconditionally, even if they exist.")),
/**
* Whether the DistCp execution should block until the job completes.
*/
BLOCKING("",
new Option("async", false, "Should distcp execution be blocking")),
FILE_LIMIT("",
new Option("filelimit", true, "(Deprecated!) Limit number of files " +
"copied to <= n")),
SIZE_LIMIT("",
new Option("sizelimit", true, "(Deprecated!) Limit number of files " +
"copied to <= n bytes")),
/**
* Specify bandwidth per map in MB
*/
BANDWIDTH(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
new Option("bandwidth", true, "Specify bandwidth per map in MB"));
private final String confLabel;
private final Option option;
DistCpOptionSwitch(String confLabel, Option option) {
this.confLabel = confLabel;
this.option = option;
}
/**
* Get Configuration label for the option
* @return configuration label name
*/
public String getConfigLabel() {
return confLabel;
}
/**
* Get CLI Option corresponding to the distcp option
* @return option
*/
public Option getOption() {
return option;
}
/**
* Get Switch symbol
* @return the switch symbol string
*/
public String getSwitch() {
return option.getOpt();
}
@Override
public String toString() {
return super.name() + " {" +
"confLabel='" + confLabel + '\'' +
", option=" + option + '}';
}
/**
* Helper function to add an option to hadoop configuration object
* @param conf - Configuration object to include the option
* @param option - Option to add
* @param value - Value
*/
public static void addToConf(Configuration conf,
DistCpOptionSwitch option,
String value) {
conf.set(option.getConfigLabel(), value);
}
/**
* Helper function to set an option to hadoop configuration object
* @param conf - Configuration object to include the option
* @param option - Option to add
*/
public static void addToConf(Configuration conf,
DistCpOptionSwitch option) {
conf.set(option.getConfigLabel(), "true");
}
}
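As a quick illustration of how this enum is meant to be used (a sketch, not part of this patch): getSwitch() yields the command-line flag, getConfigLabel() the configuration key, and the addToConf() helpers stamp a value into a job Configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCpOptionSwitch;

public class OptionSwitchSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Record that -update (SYNC_FOLDERS) was requested; the value defaults to "true".
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS);
    // Record an explicit value, e.g. the per-map bandwidth in MB.
    DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH, "50");
    System.out.println("-" + DistCpOptionSwitch.SYNC_FOLDERS.getSwitch() + " -> "
        + conf.get(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel()));   // prints: -update -> true
  }
}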

View File

@@ -0,0 +1,525 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.util.DistCpUtils;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
* The Options class encapsulates all DistCp options.
* These may be set from command-line (via the OptionsParser)
* or may be set manually.
*/
public class DistCpOptions {
private boolean atomicCommit = false;
private boolean syncFolder = false;
private boolean deleteMissing = false;
private boolean ignoreFailures = false;
private boolean overwrite = false;
private boolean skipCRC = false;
private boolean blocking = true;
private int maxMaps = DistCpConstants.DEFAULT_MAPS;
private int mapBandwidth = DistCpConstants.DEFAULT_BANDWIDTH_MB;
private String sslConfigurationFile;
private String copyStrategy = DistCpConstants.UNIFORMSIZE;
private EnumSet<FileAttribute> preserveStatus = EnumSet.noneOf(FileAttribute.class);
private Path atomicWorkPath;
private Path logPath;
private Path sourceFileListing;
private List<Path> sourcePaths;
private Path targetPath;
public static enum FileAttribute{
REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION;
public static FileAttribute getAttribute(char symbol) {
for (FileAttribute attribute : values()) {
if (attribute.name().charAt(0) == Character.toUpperCase(symbol)) {
return attribute;
}
}
throw new NoSuchElementException("No attribute for " + symbol);
}
}
/**
* Constructor, to initialize source/target paths.
* @param sourcePaths List of source-paths (including wildcards)
* to be copied to target.
* @param targetPath Destination path for the dist-copy.
*/
public DistCpOptions(List<Path> sourcePaths, Path targetPath) {
assert sourcePaths != null && !sourcePaths.isEmpty() : "Invalid source paths";
assert targetPath != null : "Invalid Target path";
this.sourcePaths = sourcePaths;
this.targetPath = targetPath;
}
/**
* Constructor, to initialize source/target paths.
* @param sourceFileListing File containing list of source paths
* @param targetPath Destination path for the dist-copy.
*/
public DistCpOptions(Path sourceFileListing, Path targetPath) {
assert sourceFileListing != null : "Invalid source paths";
assert targetPath != null : "Invalid Target path";
this.sourceFileListing = sourceFileListing;
this.targetPath = targetPath;
}
/**
* Copy constructor.
* @param that DistCpOptions being copied from.
*/
public DistCpOptions(DistCpOptions that) {
if (this != that && that != null) {
this.atomicCommit = that.atomicCommit;
this.syncFolder = that.syncFolder;
this.deleteMissing = that.deleteMissing;
this.ignoreFailures = that.ignoreFailures;
this.overwrite = that.overwrite;
this.skipCRC = that.skipCRC;
this.blocking = that.blocking;
this.maxMaps = that.maxMaps;
this.mapBandwidth = that.mapBandwidth;
this.sslConfigurationFile = that.getSslConfigurationFile();
this.copyStrategy = that.copyStrategy;
this.preserveStatus = that.preserveStatus;
this.atomicWorkPath = that.getAtomicWorkPath();
this.logPath = that.getLogPath();
this.sourceFileListing = that.getSourceFileListing();
this.sourcePaths = that.getSourcePaths();
this.targetPath = that.getTargetPath();
}
}
/**
* Should the data be committed atomically?
*
* @return true if data should be committed atomically. false otherwise
*/
public boolean shouldAtomicCommit() {
return atomicCommit;
}
/**
* Set if data needs to be committed atomically
*
* @param atomicCommit - boolean switch
*/
public void setAtomicCommit(boolean atomicCommit) {
validate(DistCpOptionSwitch.ATOMIC_COMMIT, atomicCommit);
this.atomicCommit = atomicCommit;
}
/**
* Should the data be sync'ed between source and target paths?
*
* @return true if data should be sync'ed up. false otherwise
*/
public boolean shouldSyncFolder() {
return syncFolder;
}
/**
* Set if source and target folder contents should be sync'ed up
*
* @param syncFolder - boolean switch
*/
public void setSyncFolder(boolean syncFolder) {
validate(DistCpOptionSwitch.SYNC_FOLDERS, syncFolder);
this.syncFolder = syncFolder;
}
/**
* Should files in the target that are missing from the source be deleted?
*
* @return true if zombie target files are to be removed. false otherwise
*/
public boolean shouldDeleteMissing() {
return deleteMissing;
}
/**
* Set if files only present in target should be deleted
*
* @param deleteMissing - boolean switch
*/
public void setDeleteMissing(boolean deleteMissing) {
validate(DistCpOptionSwitch.DELETE_MISSING, deleteMissing);
this.deleteMissing = deleteMissing;
}
/**
* Should failures be logged and ignored during copy?
*
* @return true if failures are to be logged and ignored. false otherwise
*/
public boolean shouldIgnoreFailures() {
return ignoreFailures;
}
/**
* Set if failures during copy should be ignored
*
* @param ignoreFailures - boolean switch
*/
public void setIgnoreFailures(boolean ignoreFailures) {
this.ignoreFailures = ignoreFailures;
}
/**
* Should DistCp run in blocking mode?
*
* @return true if it should run in blocking mode, false otherwise
*/
public boolean shouldBlock() {
return blocking;
}
/**
* Set if DistCp should run in blocking or non-blocking mode
*
* @param blocking - boolean switch
*/
public void setBlocking(boolean blocking) {
this.blocking = blocking;
}
/**
* Should files be overwritten always?
*
* @return true if files in the target that existed before distcp should always
*         be overwritten. false otherwise
*/
public boolean shouldOverwrite() {
return overwrite;
}
/**
* Set if files should always be overwritten on target
*
* @param overwrite - boolean switch
*/
public void setOverwrite(boolean overwrite) {
validate(DistCpOptionSwitch.OVERWRITE, overwrite);
this.overwrite = overwrite;
}
/**
* Should CRC/checksum check be skipped while checking files are identical
*
* @return true if checksum check should be skipped while checking files are
* identical. false otherwise
*/
public boolean shouldSkipCRC() {
return skipCRC;
}
/**
* Set if checksum comparison should be skipped while determining if
* source and destination files are identical
*
* @param skipCRC - boolean switch
*/
public void setSkipCRC(boolean skipCRC) {
validate(DistCpOptionSwitch.SKIP_CRC, skipCRC);
this.skipCRC = skipCRC;
}
/** Get the max number of maps to use for this copy
*
* @return Max number of maps
*/
public int getMaxMaps() {
return maxMaps;
}
/**
* Set the max number of maps to use for copy
*
* @param maxMaps - Number of maps
*/
public void setMaxMaps(int maxMaps) {
this.maxMaps = maxMaps;
}
/** Get the map bandwidth in MB
*
* @return Bandwidth in MB
*/
public int getMapBandwidth() {
return mapBandwidth;
}
/**
* Set per map bandwidth
*
* @param mapBandwidth - per map bandwidth
*/
public void setMapBandwidth(int mapBandwidth) {
assert mapBandwidth > 0 : "Bandwidth " + mapBandwidth + " is invalid (should be > 0)";
this.mapBandwidth = mapBandwidth;
}
/**
* Get path where the ssl configuration file is present to use for hftps://
*
* @return Path on local file system
*/
public String getSslConfigurationFile() {
return sslConfigurationFile;
}
/**
* Set the SSL configuration file path to use with hftps:// (local path)
*
* @param sslConfigurationFile - Local ssl config file path
*/
public void setSslConfigurationFile(String sslConfigurationFile) {
this.sslConfigurationFile = sslConfigurationFile;
}
/**
* Returns an iterator with the list of file attributes to preserve
*
* @return iterator of file attributes to preserve
*/
public Iterator<FileAttribute> preserveAttributes() {
return preserveStatus.iterator();
}
/**
* Checks if the input attribute should be preserved or not
*
* @param attribute - Attribute to check
* @return True if attribute should be preserved, false otherwise
*/
public boolean shouldPreserve(FileAttribute attribute) {
return preserveStatus.contains(attribute);
}
/**
* Add file attributes that need to be preserved. This method may be
* called multiple times to add attributes.
*
* @param fileAttribute - Attribute to add, one at a time
*/
public void preserve(FileAttribute fileAttribute) {
for (FileAttribute attribute : preserveStatus) {
if (attribute.equals(fileAttribute)) {
return;
}
}
preserveStatus.add(fileAttribute);
}
/** Get work path for atomic commit. If null, the work
* path would be parentOf(targetPath) + "/._WIP_" + nameOf(targetPath)
*
* @return Atomic work path on the target cluster. Null if not set
*/
public Path getAtomicWorkPath() {
return atomicWorkPath;
}
/**
* Set the work path for atomic commit
*
* @param atomicWorkPath - Path on the target cluster
*/
public void setAtomicWorkPath(Path atomicWorkPath) {
this.atomicWorkPath = atomicWorkPath;
}
/** Get the output directory for writing distcp logs. If unset, logs
* are temporarily written to JobStagingDir/_logs and deleted
* upon job completion
*
* @return Log output path on the cluster where distcp job is run
*/
public Path getLogPath() {
return logPath;
}
/**
* Set the log path where distcp output logs are stored
* Uses JobStagingDir/_logs by default
*
* @param logPath - Path where logs will be saved
*/
public void setLogPath(Path logPath) {
this.logPath = logPath;
}
/**
* Get the copy strategy to use. Uses appropriate input format
*
* @return copy strategy to use
*/
public String getCopyStrategy() {
return copyStrategy;
}
/**
* Set the copy strategy to use. Should map to a strategy implementation
* in distcp-default.xml
*
* @param copyStrategy - copy Strategy to use
*/
public void setCopyStrategy(String copyStrategy) {
this.copyStrategy = copyStrategy;
}
/**
* File path (hdfs:// or file://) that contains the list of actual
* files to copy
*
* @return - Source listing file path
*/
public Path getSourceFileListing() {
return sourceFileListing;
}
/**
* Getter for sourcePaths.
* @return List of source-paths.
*/
public List<Path> getSourcePaths() {
return sourcePaths;
}
/**
* Setter for sourcePaths.
* @param sourcePaths The new list of source-paths.
*/
public void setSourcePaths(List<Path> sourcePaths) {
assert sourcePaths != null && sourcePaths.size() != 0;
this.sourcePaths = sourcePaths;
}
/**
* Getter for the targetPath.
* @return The target-path.
*/
public Path getTargetPath() {
return targetPath;
}
public void validate(DistCpOptionSwitch option, boolean value) {
boolean syncFolder = (option == DistCpOptionSwitch.SYNC_FOLDERS ?
value : this.syncFolder);
boolean overwrite = (option == DistCpOptionSwitch.OVERWRITE ?
value : this.overwrite);
boolean deleteMissing = (option == DistCpOptionSwitch.DELETE_MISSING ?
value : this.deleteMissing);
boolean atomicCommit = (option == DistCpOptionSwitch.ATOMIC_COMMIT ?
value : this.atomicCommit);
boolean skipCRC = (option == DistCpOptionSwitch.SKIP_CRC ?
value : this.skipCRC);
if (syncFolder && atomicCommit) {
throw new IllegalArgumentException("Atomic commit can't be used with " +
"sync folder or overwrite options");
}
if (deleteMissing && !(overwrite || syncFolder)) {
throw new IllegalArgumentException("Delete missing is applicable " +
"only with update or overwrite options");
}
if (overwrite && syncFolder) {
throw new IllegalArgumentException("Overwrite and update options are " +
"mutually exclusive");
}
if (!syncFolder && skipCRC) {
throw new IllegalArgumentException("Skip CRC is valid only with update options");
}
}
/**
* Add options to configuration. These will be used in the Mapper/committer
*
* @param conf - Configuration object to which the options need to be added
*/
public void appendToConf(Configuration conf) {
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT,
String.valueOf(atomicCommit));
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.IGNORE_FAILURES,
String.valueOf(ignoreFailures));
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS,
String.valueOf(syncFolder));
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DELETE_MISSING,
String.valueOf(deleteMissing));
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.OVERWRITE,
String.valueOf(overwrite));
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SKIP_CRC,
String.valueOf(skipCRC));
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH,
String.valueOf(mapBandwidth));
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.PRESERVE_STATUS,
DistCpUtils.packAttributes(preserveStatus));
}
/**
* Utility to easily string-ify Options, for logging.
*
* @return String representation of the Options.
*/
@Override
public String toString() {
return "DistCpOptions{" +
"atomicCommit=" + atomicCommit +
", syncFolder=" + syncFolder +
", deleteMissing=" + deleteMissing +
", ignoreFailures=" + ignoreFailures +
", maxMaps=" + maxMaps +
", sslConfigurationFile='" + sslConfigurationFile + '\'' +
", copyStrategy='" + copyStrategy + '\'' +
", sourceFileListing=" + sourceFileListing +
", sourcePaths=" + sourcePaths +
", targetPath=" + targetPath +
'}';
}
@Override
protected DistCpOptions clone() throws CloneNotSupportedException {
return (DistCpOptions) super.clone();
}
}
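A short sketch (not part of this patch) of building DistCpOptions programmatically; it also shows validate() rejecting an incompatible combination of switches. The paths are placeholders.

import java.util.Arrays;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCpOptions;

public class DistCpOptionsSketch {
  public static void main(String[] args) {
    DistCpOptions options = new DistCpOptions(
        Arrays.asList(new Path("hdfs://src-nn:8020/data")),   // source (placeholder)
        new Path("hdfs://dst-nn:8020/data"));                 // target (placeholder)
    options.setSyncFolder(true);      // -update
    options.setDeleteMissing(true);   // -delete, valid together with -update
    try {
      options.setAtomicCommit(true);  // -atomic conflicts with -update
    } catch (IllegalArgumentException expected) {
      System.out.println("Rejected: " + expected.getMessage());
    }
  }
}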

View File

@@ -0,0 +1,100 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.Credentials;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
/**
* FileBasedCopyListing implements the CopyListing interface,
* to create the copy-listing for DistCp,
* by iterating over all source paths mentioned in a specified input-file.
*/
public class FileBasedCopyListing extends CopyListing {
private final CopyListing globbedListing;
/**
* Constructor, to initialize base-class.
* @param configuration The input Configuration object.
* @param credentials - Credentials object on which the FS delegation tokens are cached. If null
* delegation token caching is skipped
*/
public FileBasedCopyListing(Configuration configuration, Credentials credentials) {
super(configuration, credentials);
globbedListing = new GlobbedCopyListing(getConf(), credentials);
}
/** {@inheritDoc} */
@Override
protected void validatePaths(DistCpOptions options)
throws IOException, InvalidInputException {
}
/**
* Implementation of CopyListing::buildListing().
* Iterates over all source paths mentioned in the input-file.
* @param pathToListFile Path on HDFS where the listing file is written.
* @param options Input Options for DistCp (indicating source/target paths.)
* @throws IOException
*/
@Override
public void doBuildListing(Path pathToListFile, DistCpOptions options) throws IOException {
DistCpOptions newOption = new DistCpOptions(options);
newOption.setSourcePaths(fetchFileList(options.getSourceFileListing()));
globbedListing.buildListing(pathToListFile, newOption);
}
private List<Path> fetchFileList(Path sourceListing) throws IOException {
List<Path> result = new ArrayList<Path>();
FileSystem fs = sourceListing.getFileSystem(getConf());
BufferedReader input = null;
try {
input = new BufferedReader(new InputStreamReader(fs.open(sourceListing)));
String line = input.readLine();
while (line != null) {
result.add(new Path(line));
line = input.readLine();
}
} finally {
IOUtils.closeStream(input);
}
return result;
}
/** {@inheritDoc} */
@Override
protected long getBytesToCopy() {
return globbedListing.getBytesToCopy();
}
/** {@inheritDoc} */
@Override
protected long getNumberOfPaths() {
return globbedListing.getNumberOfPaths();
}
}
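To make the -f flow concrete, here is an illustrative sketch (not part of this patch): the listing file holds one source path per line, and FileBasedCopyListing expands it into the sequence-file copy listing via the inherited buildListing() (assumed callable from outside the package). All paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.FileBasedCopyListing;

public class FileListingSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // srcList.txt contains one source path per line, e.g.:
    //   hdfs://src-nn:8020/data/2012/01/25
    //   hdfs://src-nn:8020/data/2012/01/26
    Path srcList = new Path("hdfs://src-nn:8020/meta/srcList.txt");   // placeholder
    Path target  = new Path("hdfs://dst-nn:8020/backup");             // placeholder
    DistCpOptions options = new DistCpOptions(srcList, target);
    FileBasedCopyListing listing = new FileBasedCopyListing(conf, new Credentials());
    // Writes the expanded copy listing to a sequence file (placeholder path).
    listing.buildListing(new Path("hdfs://dst-nn:8020/meta/fileList.seq"), options);
  }
}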

View File

@@ -0,0 +1,105 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.Credentials;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
/**
* GlobbedCopyListing implements the CopyListing interface, to create the copy
* listing-file by "globbing" all specified source paths (wild-cards and all.)
*/
public class GlobbedCopyListing extends CopyListing {
private static final Log LOG = LogFactory.getLog(GlobbedCopyListing.class);
private final CopyListing simpleListing;
/**
* Constructor, to initialize the configuration.
* @param configuration The input Configuration object.
* @param credentials Credentials object on which the FS delegation tokens are cached. If null
* delegation token caching is skipped
*/
public GlobbedCopyListing(Configuration configuration, Credentials credentials) {
super(configuration, credentials);
simpleListing = new SimpleCopyListing(getConf(), credentials) ;
}
/** {@inheritDoc} */
@Override
protected void validatePaths(DistCpOptions options)
throws IOException, InvalidInputException {
}
/**
* Implementation of CopyListing::buildListing().
* Creates the copy listing by "globbing" all source-paths.
* @param pathToListingFile The location at which the copy-listing file
* is to be created.
* @param options Input Options for DistCp (indicating source/target paths.)
* @throws IOException
*/
@Override
public void doBuildListing(Path pathToListingFile,
DistCpOptions options) throws IOException {
List<Path> globbedPaths = new ArrayList<Path>();
if (options.getSourcePaths().isEmpty()) {
throw new InvalidInputException("Nothing to process. Source paths::EMPTY");
}
for (Path p : options.getSourcePaths()) {
FileSystem fs = p.getFileSystem(getConf());
FileStatus[] inputs = fs.globStatus(p);
if(inputs != null && inputs.length > 0) {
for (FileStatus onePath: inputs) {
globbedPaths.add(onePath.getPath());
}
} else {
throw new InvalidInputException(p + " doesn't exist");
}
}
DistCpOptions optionsGlobbed = new DistCpOptions(options);
optionsGlobbed.setSourcePaths(globbedPaths);
simpleListing.buildListing(pathToListingFile, optionsGlobbed);
}
/** {@inheritDoc} */
@Override
protected long getBytesToCopy() {
return simpleListing.getBytesToCopy();
}
/** {@inheritDoc} */
@Override
protected long getNumberOfPaths() {
return simpleListing.getNumberOfPaths();
}
}

View File

@@ -0,0 +1,246 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
import java.util.*;
/**
* The OptionsParser parses out the command-line options passed to DistCp,
* and interprets those specific to DistCp, to create an Options object.
*/
public class OptionsParser {
private static final Log LOG = LogFactory.getLog(OptionsParser.class);
private static final Options cliOptions = new Options();
static {
for (DistCpOptionSwitch option : DistCpOptionSwitch.values()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Adding option " + option.getOption());
}
cliOptions.addOption(option.getOption());
}
}
private static class CustomParser extends GnuParser {
@Override
protected String[] flatten(Options options, String[] arguments, boolean stopAtNonOption) {
for (int index = 0; index < arguments.length; index++) {
if (arguments[index].equals("-" + DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
arguments[index] = "-prbugp";
}
}
return super.flatten(options, arguments, stopAtNonOption);
}
}
/**
* The parse method parses the command-line options, and creates
* a corresponding Options object.
* @param args Command-line arguments (excluding the options consumed
* by the GenericOptionsParser).
* @return The Options object, corresponding to the specified command-line.
* @throws IllegalArgumentException Thrown if the parse fails.
*/
public static DistCpOptions parse(String args[]) throws IllegalArgumentException {
CommandLineParser parser = new CustomParser();
CommandLine command;
try {
command = parser.parse(cliOptions, args, true);
} catch (ParseException e) {
throw new IllegalArgumentException("Unable to parse arguments. " +
Arrays.toString(args), e);
}
DistCpOptions option;
Path targetPath;
List<Path> sourcePaths = new ArrayList<Path>();
String leftOverArgs[] = command.getArgs();
if (leftOverArgs == null || leftOverArgs.length < 1) {
throw new IllegalArgumentException("Target path not specified");
}
//Last Argument is the target path
targetPath = new Path(leftOverArgs[leftOverArgs.length -1].trim());
//Copy any source paths in the arguments to the list
for (int index = 0; index < leftOverArgs.length - 1; index++) {
sourcePaths.add(new Path(leftOverArgs[index].trim()));
}
/* If the command has a source file listing, use it; else, fall back on the source paths in args.
If both are present, throw an exception and bail */
if (command.hasOption(DistCpOptionSwitch.SOURCE_FILE_LISTING.getSwitch())) {
if (!sourcePaths.isEmpty()) {
throw new IllegalArgumentException("Both source file listing and source paths present");
}
option = new DistCpOptions(new Path(getVal(command, DistCpOptionSwitch.
SOURCE_FILE_LISTING.getSwitch())), targetPath);
} else {
if (sourcePaths.isEmpty()) {
throw new IllegalArgumentException("Neither source file listing nor source paths present");
}
option = new DistCpOptions(sourcePaths, targetPath);
}
//Process all the other option switches and set options appropriately
if (command.hasOption(DistCpOptionSwitch.IGNORE_FAILURES.getSwitch())) {
option.setIgnoreFailures(true);
}
if (command.hasOption(DistCpOptionSwitch.ATOMIC_COMMIT.getSwitch())) {
option.setAtomicCommit(true);
}
if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch()) &&
option.shouldAtomicCommit()) {
String workPath = getVal(command, DistCpOptionSwitch.WORK_PATH.getSwitch());
if (workPath != null && !workPath.isEmpty()) {
option.setAtomicWorkPath(new Path(workPath));
}
} else if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch())) {
throw new IllegalArgumentException("-tmp work-path can only be specified along with -atomic");
}
if (command.hasOption(DistCpOptionSwitch.LOG_PATH.getSwitch())) {
option.setLogPath(new Path(getVal(command, DistCpOptionSwitch.LOG_PATH.getSwitch())));
}
if (command.hasOption(DistCpOptionSwitch.SYNC_FOLDERS.getSwitch())) {
option.setSyncFolder(true);
}
if (command.hasOption(DistCpOptionSwitch.OVERWRITE.getSwitch())) {
option.setOverwrite(true);
}
if (command.hasOption(DistCpOptionSwitch.DELETE_MISSING.getSwitch())) {
option.setDeleteMissing(true);
}
if (command.hasOption(DistCpOptionSwitch.SKIP_CRC.getSwitch())) {
option.setSkipCRC(true);
}
if (command.hasOption(DistCpOptionSwitch.BLOCKING.getSwitch())) {
option.setBlocking(false);
}
if (command.hasOption(DistCpOptionSwitch.BANDWIDTH.getSwitch())) {
try {
Integer mapBandwidth = Integer.parseInt(
getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()).trim());
option.setMapBandwidth(mapBandwidth);
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Bandwidth specified is invalid: " +
getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()), e);
}
}
if (command.hasOption(DistCpOptionSwitch.SSL_CONF.getSwitch())) {
option.setSslConfigurationFile(command.
getOptionValue(DistCpOptionSwitch.SSL_CONF.getSwitch()));
}
if (command.hasOption(DistCpOptionSwitch.MAX_MAPS.getSwitch())) {
try {
Integer maps = Integer.parseInt(
getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()).trim());
option.setMaxMaps(maps);
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Number of maps is invalid: " +
getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()), e);
}
}
if (command.hasOption(DistCpOptionSwitch.COPY_STRATEGY.getSwitch())) {
option.setCopyStrategy(
getVal(command, DistCpOptionSwitch.COPY_STRATEGY.getSwitch()));
}
if (command.hasOption(DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
String attributes =
getVal(command, DistCpOptionSwitch.PRESERVE_STATUS.getSwitch());
if (attributes == null || attributes.isEmpty()) {
for (FileAttribute attribute : FileAttribute.values()) {
option.preserve(attribute);
}
} else {
for (int index = 0; index < attributes.length(); index++) {
option.preserve(FileAttribute.
getAttribute(attributes.charAt(index)));
}
}
}
if (command.hasOption(DistCpOptionSwitch.FILE_LIMIT.getSwitch())) {
String fileLimitString = getVal(command,
DistCpOptionSwitch.FILE_LIMIT.getSwitch().trim());
try {
Integer.parseInt(fileLimitString);
}
catch (NumberFormatException e) {
throw new IllegalArgumentException("File-limit is invalid: "
+ fileLimitString, e);
}
LOG.warn(DistCpOptionSwitch.FILE_LIMIT.getSwitch() + " is a deprecated" +
" option. Ignoring.");
}
if (command.hasOption(DistCpOptionSwitch.SIZE_LIMIT.getSwitch())) {
String sizeLimitString = getVal(command,
DistCpOptionSwitch.SIZE_LIMIT.getSwitch().trim());
try {
Long.parseLong(sizeLimitString);
}
catch (NumberFormatException e) {
throw new IllegalArgumentException("Size-limit is invalid: "
+ sizeLimitString, e);
}
LOG.warn(DistCpOptionSwitch.SIZE_LIMIT.getSwitch() + " is a deprecated" +
" option. Ignoring.");
}
return option;
}
private static String getVal(CommandLine command, String swtch) {
String optionValue = command.getOptionValue(swtch);
if (optionValue == null) {
return null;
} else {
return optionValue.trim();
}
}
public static void usage() {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("distcp OPTIONS [source_path...] <target_path>\n\nOPTIONS", cliOptions);
}
}
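A minimal sketch (not part of this patch) of exercising the parser directly: parse() turns shell-style arguments into a DistCpOptions instance, and usage() prints the help text assembled from DistCpOptionSwitch. The paths are placeholders.

import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.OptionsParser;

public class OptionsParserSketch {
  public static void main(String[] args) {
    try {
      DistCpOptions options = OptionsParser.parse(new String[] {
          "-update", "-delete", "-m", "10",
          "hdfs://src-nn:8020/data",     // source (placeholder)
          "hdfs://dst-nn:8020/data"});   // target (placeholder)
      System.out.println(options);       // DistCpOptions{atomicCommit=false, ...}
    } catch (IllegalArgumentException e) {
      OptionsParser.usage();
    }
  }
}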

View File

@@ -0,0 +1,275 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.security.Credentials;
import java.io.*;
import java.util.Stack;
/**
* The SimpleCopyListing is responsible for making the exhaustive list of
* all files/directories under its specified list of input-paths.
* These are written into the specified copy-listing file.
* Note: The SimpleCopyListing doesn't handle wild-cards in the input-paths.
*/
public class SimpleCopyListing extends CopyListing {
private static final Log LOG = LogFactory.getLog(SimpleCopyListing.class);
private long totalPaths = 0;
private long totalBytesToCopy = 0;
/**
* Protected constructor, to initialize configuration.
*
* @param configuration The input configuration, with which the source/target FileSystems may be accessed.
* @param credentials - Credentials object on which the FS delegation tokens are cached. If null
* delegation token caching is skipped
*/
protected SimpleCopyListing(Configuration configuration, Credentials credentials) {
super(configuration, credentials);
}
@Override
protected void validatePaths(DistCpOptions options)
throws IOException, InvalidInputException {
Path targetPath = options.getTargetPath();
FileSystem targetFS = targetPath.getFileSystem(getConf());
boolean targetIsFile = targetFS.isFile(targetPath);
//If target is a file, then source has to be single file
if (targetIsFile) {
if (options.getSourcePaths().size() > 1) {
throw new InvalidInputException("Multiple source being copied to a file: " +
targetPath);
}
Path srcPath = options.getSourcePaths().get(0);
FileSystem sourceFS = srcPath.getFileSystem(getConf());
if (!sourceFS.isFile(srcPath)) {
throw new InvalidInputException("Cannot copy " + srcPath +
", which is not a file to " + targetPath);
}
}
if (options.shouldAtomicCommit() && targetFS.exists(targetPath)) {
throw new InvalidInputException("Target path for atomic-commit already exists: " +
targetPath + ". Cannot atomic-commit to pre-existing target-path.");
}
for (Path path: options.getSourcePaths()) {
FileSystem fs = path.getFileSystem(getConf());
if (!fs.exists(path)) {
throw new InvalidInputException(path + " doesn't exist");
}
}
/* This is required to allow map tasks to access each of the source
clusters. This retrieves the delegation token for each unique
file system and adds it to the job's private credential store
*/
Credentials credentials = getCredentials();
if (credentials != null) {
Path[] inputPaths = options.getSourcePaths().toArray(new Path[1]);
TokenCache.obtainTokensForNamenodes(credentials, inputPaths, getConf());
}
}
/** {@inheritDoc} */
@Override
public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {
SequenceFile.Writer fileListWriter = null;
try {
fileListWriter = getWriter(pathToListingFile);
for (Path path: options.getSourcePaths()) {
FileSystem sourceFS = path.getFileSystem(getConf());
path = makeQualified(path);
FileStatus rootStatus = sourceFS.getFileStatus(path);
Path sourcePathRoot = computeSourceRootPath(rootStatus, options);
boolean localFile = (rootStatus.getClass() != FileStatus.class);
FileStatus[] sourceFiles = sourceFS.listStatus(path);
if (sourceFiles != null && sourceFiles.length > 0) {
for (FileStatus sourceStatus: sourceFiles) {
if (LOG.isDebugEnabled()) {
LOG.debug("Recording source-path: " + sourceStatus.getPath() + " for copy.");
}
writeToFileListing(fileListWriter, sourceStatus, sourcePathRoot, localFile);
if (isDirectoryAndNotEmpty(sourceFS, sourceStatus)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Traversing non-empty source dir: " + sourceStatus.getPath());
}
traverseNonEmptyDirectory(fileListWriter, sourceStatus, sourcePathRoot, localFile);
}
}
} else {
writeToFileListing(fileListWriter, rootStatus, sourcePathRoot, localFile);
}
}
} finally {
IOUtils.closeStream(fileListWriter);
}
}
private Path computeSourceRootPath(FileStatus sourceStatus,
DistCpOptions options) throws IOException {
Path target = options.getTargetPath();
FileSystem targetFS = target.getFileSystem(getConf());
boolean solitaryFile = options.getSourcePaths().size() == 1
&& !sourceStatus.isDirectory();
if (solitaryFile) {
if (targetFS.isFile(target) || !targetFS.exists(target)) {
return sourceStatus.getPath();
} else {
return sourceStatus.getPath().getParent();
}
} else {
boolean specialHandling = (options.getSourcePaths().size() == 1 && !targetFS.exists(target)) ||
options.shouldSyncFolder() || options.shouldOverwrite();
return specialHandling && sourceStatus.isDirectory() ? sourceStatus.getPath() :
sourceStatus.getPath().getParent();
}
}
/** {@inheritDoc} */
@Override
protected long getBytesToCopy() {
return totalBytesToCopy;
}
/** {@inheritDoc} */
@Override
protected long getNumberOfPaths() {
return totalPaths;
}
private Path makeQualified(Path path) throws IOException {
final FileSystem fs = path.getFileSystem(getConf());
return path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
}
private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException {
FileSystem fs = pathToListFile.getFileSystem(getConf());
if (fs.exists(pathToListFile)) {
fs.delete(pathToListFile, false);
}
return SequenceFile.createWriter(getConf(),
SequenceFile.Writer.file(pathToListFile),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(FileStatus.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
}
private static boolean isDirectoryAndNotEmpty(FileSystem fileSystem,
FileStatus fileStatus) throws IOException {
return fileStatus.isDirectory() && getChildren(fileSystem, fileStatus).length > 0;
}
private static FileStatus[] getChildren(FileSystem fileSystem,
FileStatus parent) throws IOException {
return fileSystem.listStatus(parent.getPath());
}
private void traverseNonEmptyDirectory(SequenceFile.Writer fileListWriter,
FileStatus sourceStatus,
Path sourcePathRoot, boolean localFile)
throws IOException {
FileSystem sourceFS = sourcePathRoot.getFileSystem(getConf());
Stack<FileStatus> pathStack = new Stack<FileStatus>();
pathStack.push(sourceStatus);
while (!pathStack.isEmpty()) {
for (FileStatus child: getChildren(sourceFS, pathStack.pop())) {
if (LOG.isDebugEnabled())
LOG.debug("Recording source-path: "
+ sourceStatus.getPath() + " for copy.");
writeToFileListing(fileListWriter, child, sourcePathRoot, localFile);
if (isDirectoryAndNotEmpty(sourceFS, child)) {
if (LOG.isDebugEnabled())
LOG.debug("Traversing non-empty source dir: "
+ sourceStatus.getPath());
pathStack.push(child);
}
}
}
}
private void writeToFileListing(SequenceFile.Writer fileListWriter,
FileStatus fileStatus, Path sourcePathRoot,
boolean localFile) throws IOException {
if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDirectory())
return; // Skip the root-paths.
if (LOG.isDebugEnabled()) {
LOG.debug("REL PATH: " + DistCpUtils.getRelativePath(sourcePathRoot,
fileStatus.getPath()) + ", FULL PATH: " + fileStatus.getPath());
}
FileStatus status = fileStatus;
if (localFile) {
status = getFileStatus(fileStatus);
}
fileListWriter.append(new Text(DistCpUtils.getRelativePath(sourcePathRoot,
fileStatus.getPath())), status);
fileListWriter.sync();
if (!fileStatus.isDirectory()) {
totalBytesToCopy += fileStatus.getLen();
}
totalPaths++;
}
private static final ByteArrayOutputStream buffer = new ByteArrayOutputStream(64);
private DataInputBuffer in = new DataInputBuffer();
private FileStatus getFileStatus(FileStatus fileStatus) throws IOException {
FileStatus status = new FileStatus();
buffer.reset();
DataOutputStream out = new DataOutputStream(buffer);
fileStatus.write(out);
in.reset(buffer.toByteArray(), 0, buffer.size());
status.readFields(in);
return status;
}
}
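Since the listing produced above is an ordinary SequenceFile of (Text relative-path, FileStatus) pairs, it can be read back with a plain SequenceFile.Reader, which is the same pattern the committer below uses. An illustrative sketch, not part of this patch; the listing path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ListingReaderSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path listing = new Path("hdfs://dst-nn:8020/meta/fileList.seq");   // placeholder
    SequenceFile.Reader reader = new SequenceFile.Reader(conf,
        SequenceFile.Reader.file(listing));
    try {
      Text relPath = new Text();
      FileStatus status = new FileStatus();
      while (reader.next(relPath, status)) {   // key: source-relative path, value: FileStatus
        System.out.println(relPath + " -> " + status.getLen() + " bytes");
      }
    } finally {
      reader.close();
    }
  }
}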

View File

@@ -0,0 +1,297 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.tools.*;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
import org.apache.hadoop.tools.util.DistCpUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
/**
* The CopyCommitter class is DistCp's OutputCommitter implementation. It is
* responsible for handling the completion/cleanup of the DistCp run.
* Specifically, it does the following:
* 1. Cleanup of the meta-folder (where DistCp maintains its file-list, etc.)
* 2. Preservation of user/group/replication-factor on any directories that
* have been copied. (Files are taken care of in their map-tasks.)
* 3. Atomic-move of data from the temporary work-folder to the final path
* (if atomic-commit was opted for).
* 4. Deletion of files from the target that are missing at source (if opted for).
* 5. Cleanup of any partially copied files, from previous, failed attempts.
*/
public class CopyCommitter extends FileOutputCommitter {
private static final Log LOG = LogFactory.getLog(CopyCommitter.class);
private final TaskAttemptContext taskAttemptContext;
/**
* Create an output committer
*
* @param outputPath the job's output path
* @param context the task's context
* @throws IOException - Exception if any
*/
public CopyCommitter(Path outputPath, TaskAttemptContext context) throws IOException {
super(outputPath, context);
this.taskAttemptContext = context;
}
/** @inheritDoc */
@Override
public void commitJob(JobContext jobContext) throws IOException {
Configuration conf = jobContext.getConfiguration();
super.commitJob(jobContext);
cleanupTempFiles(jobContext);
String attributes = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
if (attributes != null && !attributes.isEmpty()) {
preserveFileAttributesForDirectories(conf);
}
try {
if (conf.getBoolean(DistCpConstants.CONF_LABEL_DELETE_MISSING, false)) {
deleteMissing(conf);
} else if (conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false)) {
commitData(conf);
}
taskAttemptContext.setStatus("Commit Successful");
}
finally {
cleanup(conf);
}
}
/** {@inheritDoc} */
@Override
public void abortJob(JobContext jobContext,
JobStatus.State state) throws IOException {
try {
super.abortJob(jobContext, state);
} finally {
cleanupTempFiles(jobContext);
cleanup(jobContext.getConfiguration());
}
}
private void cleanupTempFiles(JobContext context) {
try {
Configuration conf = context.getConfiguration();
Path targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
FileSystem targetFS = targetWorkPath.getFileSystem(conf);
String jobId = context.getJobID().toString();
deleteAttemptTempFiles(targetWorkPath, targetFS, jobId);
deleteAttemptTempFiles(targetWorkPath.getParent(), targetFS, jobId);
} catch (Throwable t) {
LOG.warn("Unable to cleanup temp files", t);
}
}
private void deleteAttemptTempFiles(Path targetWorkPath,
FileSystem targetFS,
String jobId) throws IOException {
FileStatus[] tempFiles = targetFS.globStatus(
new Path(targetWorkPath, ".distcp.tmp." + jobId.replaceAll("job","attempt") + "*"));
if (tempFiles != null && tempFiles.length > 0) {
for (FileStatus file : tempFiles) {
LOG.info("Cleaning up " + file.getPath());
targetFS.delete(file.getPath(), false);
}
}
}
/**
* Cleanup meta folder and other temporary files
*
* @param conf - Job Configuration
*/
private void cleanup(Configuration conf) {
Path metaFolder = new Path(conf.get(DistCpConstants.CONF_LABEL_META_FOLDER));
try {
FileSystem fs = metaFolder.getFileSystem(conf);
LOG.info("Cleaning up temporary work folder: " + metaFolder);
fs.delete(metaFolder, true);
} catch (IOException ignore) {
LOG.error("Exception encountered ", ignore);
}
}
// This method changes the target-directories' file-attributes (owner,
// user/group permissions, etc.) based on the corresponding source directories.
private void preserveFileAttributesForDirectories(Configuration conf) throws IOException {
String attrSymbols = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
LOG.info("About to preserve attributes: " + attrSymbols);
EnumSet<FileAttribute> attributes = DistCpUtils.unpackAttributes(attrSymbols);
Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
FileSystem clusterFS = sourceListing.getFileSystem(conf);
SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(sourceListing));
long totalLen = clusterFS.getFileStatus(sourceListing).getLen();
Path targetRoot = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
long preservedEntries = 0;
try {
FileStatus srcFileStatus = new FileStatus();
Text srcRelPath = new Text();
// Iterate over every source path that was copied.
while (sourceReader.next(srcRelPath, srcFileStatus)) {
// File-attributes for files are set at the time of copy,
// in the map-task.
if (! srcFileStatus.isDirectory()) continue;
Path targetFile = new Path(targetRoot.toString() + "/" + srcRelPath);
// Skip the root folder.
// Status can't be preserved on root-folder. (E.g. multiple paths may
// be copied to a single target folder. Which source-attributes to use
// on the target is undefined.)
if (targetRoot.equals(targetFile)) continue;
FileSystem targetFS = targetFile.getFileSystem(conf);
DistCpUtils.preserve(targetFS, targetFile, srcFileStatus, attributes);
preservedEntries++;
taskAttemptContext.progress();
taskAttemptContext.setStatus("Preserving status on directory entries. [" +
sourceReader.getPosition() * 100 / totalLen + "%]");
}
} finally {
IOUtils.closeStream(sourceReader);
}
LOG.info("Preserved status on " + preservedEntries + " dir entries on target");
}
// This method deletes "extra" files from the target, if they're not
// available at the source.
private void deleteMissing(Configuration conf) throws IOException {
LOG.info("-delete option is enabled. About to remove entries from " +
"target that are missing in source");
// Sort the source-file listing alphabetically.
Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
FileSystem clusterFS = sourceListing.getFileSystem(conf);
Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
// Similarly, create the listing of target-files. Sort alphabetically.
Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
List<Path> targets = new ArrayList<Path>(1);
Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
targets.add(targetFinalPath);
DistCpOptions options = new DistCpOptions(targets, new Path("/NONE"));
target.buildListing(targetListing, options);
Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(sortedSourceListing));
SequenceFile.Reader targetReader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(sortedTargetListing));
// Walk both source and target file listings.
// Delete all from target that doesn't also exist on source.
long deletedEntries = 0;
try {
FileStatus srcFileStatus = new FileStatus();
Text srcRelPath = new Text();
FileStatus trgtFileStatus = new FileStatus();
Text trgtRelPath = new Text();
FileSystem targetFS = targetFinalPath.getFileSystem(conf);
boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
while (targetReader.next(trgtRelPath, trgtFileStatus)) {
// Skip sources that don't exist on target.
while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
}
if (srcAvailable && trgtRelPath.equals(srcRelPath)) continue;
// Target doesn't exist at source. Delete.
boolean result = (!targetFS.exists(trgtFileStatus.getPath()) ||
targetFS.delete(trgtFileStatus.getPath(), true));
if (result) {
LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
deletedEntries++;
} else {
throw new IOException("Unable to delete " + trgtFileStatus.getPath());
}
taskAttemptContext.progress();
taskAttemptContext.setStatus("Deleting missing files from target. [" +
targetReader.getPosition() * 100 / totalLen + "%]");
}
} finally {
IOUtils.closeStream(sourceReader);
IOUtils.closeStream(targetReader);
}
LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
}
private void commitData(Configuration conf) throws IOException {
Path workDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
Path finalDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
FileSystem targetFS = workDir.getFileSystem(conf);
LOG.info("Atomic commit enabled. Moving " + workDir + " to " + finalDir);
if (targetFS.exists(finalDir) && targetFS.exists(workDir)) {
LOG.error("Pre-existing final-path found at: " + finalDir);
throw new IOException("Target-path can't be committed to because it " +
"exists at " + finalDir + ". Copied data is in temp-dir: " + workDir + ". ");
}
boolean result = targetFS.rename(workDir, finalDir);
if (!result) {
LOG.warn("Rename failed. Perhaps data already moved. Verifying...");
result = targetFS.exists(finalDir) && !targetFS.exists(workDir);
}
if (result) {
LOG.info("Data committed successfully to " + finalDir);
taskAttemptContext.setStatus("Data committed successfully to " + finalDir);
} else {
LOG.error("Unable to commit data to " + finalDir);
throw new IOException("Atomic commit failed. Temporary data in " + workDir +
", Unable to move to " + finalDir);
}
}
}
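// Illustrative sketch, not part of the DistCp sources: deleteMissing() above is
// essentially a merge-walk over two alphabetically sorted listings. The standalone
// class below reproduces that walk with plain sorted string lists; the class name
// and sample paths are made up for illustration only.
public class SortedListingDiffSketch {
  public static void main(String[] args) {
    java.util.List<String> source = java.util.Arrays.asList("/a", "/b", "/d");
    java.util.List<String> target = java.util.Arrays.asList("/a", "/c", "/d", "/e");
    int s = 0;
    for (String t : target) {
      // Advance the source pointer past entries that sort before the current target entry.
      while (s < source.size() && source.get(s).compareTo(t) < 0) {
        s++;
      }
      if (s < source.size() && source.get(s).equals(t)) {
        continue; // Present at source: keep it on the target.
      }
      System.out.println("Would delete from target (missing at source): " + t);
    }
  }
}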

View File

@@ -0,0 +1,330 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.tools.DistCpOptionSwitch;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.util.StringUtils;
import java.io.*;
import java.util.EnumSet;
import java.util.Arrays;
/**
* Mapper class that executes the DistCp copy operation.
* Extends the o.a.h.mapreduce.Mapper class.
*/
public class CopyMapper extends Mapper<Text, FileStatus, Text, Text> {
/**
* Hadoop counters for the DistCp CopyMapper.
* (These have been kept identical to the old DistCp,
* for backward compatibility.)
*/
public static enum Counter {
COPY, // Number of files received by the mapper for copy.
SKIP, // Number of files skipped.
FAIL, // Number of files that failed to be copied.
BYTESCOPIED, // Number of bytes actually copied by the copy-mapper, total.
BYTESEXPECTED,// Number of bytes expected to be copied.
BYTESFAILED, // Number of bytes that failed to be copied.
BYTESSKIPPED, // Number of bytes that were skipped from copy.
}
private static Log LOG = LogFactory.getLog(CopyMapper.class);
private Configuration conf;
private boolean syncFolders = false;
private boolean ignoreFailures = false;
private boolean skipCrc = false;
private boolean overWrite = false;
private EnumSet<FileAttribute> preserve = EnumSet.noneOf(FileAttribute.class);
private FileSystem targetFS = null;
private Path targetWorkPath = null;
/**
* Implementation of the Mapper::setup() method. This extracts the DistCp-
* options specified in the Job's configuration, to set up the map task.
* @param context Mapper's context.
* @throws IOException On IO failure.
* @throws InterruptedException If the job is interrupted.
*/
@Override
public void setup(Context context) throws IOException, InterruptedException {
conf = context.getConfiguration();
syncFolders = conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false);
ignoreFailures = conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false);
skipCrc = conf.getBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), false);
overWrite = conf.getBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), false);
preserve = DistCpUtils.unpackAttributes(conf.get(DistCpOptionSwitch.
PRESERVE_STATUS.getConfigLabel()));
targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
Path targetFinalPath = new Path(conf.get(
DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
targetFS = targetFinalPath.getFileSystem(conf);
if (targetFS.exists(targetFinalPath) && targetFS.isFile(targetFinalPath)) {
overWrite = true; // When target is an existing file, overwrite it.
}
if (conf.get(DistCpConstants.CONF_LABEL_SSL_CONF) != null) {
initializeSSLConf(context);
}
}
/**
* Initialize the SSL configuration, if one is specified in the job configuration
*
* @throws IOException - If any
*/
private void initializeSSLConf(Context context) throws IOException {
LOG.info("Initializing SSL configuration");
String workDir = conf.get(JobContext.JOB_LOCAL_DIR) + "/work";
Path[] cacheFiles = context.getLocalCacheFiles();
Configuration sslConfig = new Configuration(false);
String sslConfFileName = conf.get(DistCpConstants.CONF_LABEL_SSL_CONF);
Path sslClient = findCacheFile(cacheFiles, sslConfFileName);
if (sslClient == null) {
LOG.warn("SSL Client config file not found. Was looking for " + sslConfFileName +
" in " + Arrays.toString(cacheFiles));
return;
}
sslConfig.addResource(sslClient);
String trustStoreFile = conf.get("ssl.client.truststore.location");
Path trustStorePath = findCacheFile(cacheFiles, trustStoreFile);
sslConfig.set("ssl.client.truststore.location", trustStorePath.toString());
String keyStoreFile = conf.get("ssl.client.keystore.location");
Path keyStorePath = findCacheFile(cacheFiles, keyStoreFile);
sslConfig.set("ssl.client.keystore.location", keyStorePath.toString());
try {
OutputStream out = new FileOutputStream(workDir + "/" + sslConfFileName);
try {
sslConfig.writeXml(out);
} finally {
out.close();
}
conf.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfFileName);
} catch (IOException e) {
LOG.warn("Unable to write out the ssl configuration. " +
"Will fall back to default ssl-client.xml in class path, if there is one", e);
}
}
/**
* Find an entry in the distributed cache
*
* @param cacheFiles - All localized cache files
* @param fileName - The file name to search for
* @return Path of the filename if found, else null
*/
private Path findCacheFile(Path[] cacheFiles, String fileName) {
if (cacheFiles != null && cacheFiles.length > 0) {
for (Path file : cacheFiles) {
if (file.getName().equals(fileName)) {
return file;
}
}
}
return null;
}
/**
* Implementation of the Mapper<>::map(). Does the copy.
* @param relPath The relative path of the target file.
* @param sourceFileStatus The FileStatus of the source file.
* @throws IOException
*/
@Override
public void map(Text relPath, FileStatus sourceFileStatus, Context context)
throws IOException, InterruptedException {
Path sourcePath = sourceFileStatus.getPath();
if (LOG.isDebugEnabled())
LOG.debug("DistCpMapper::map(): Received " + sourcePath + ", " + relPath);
Path target = new Path(targetWorkPath.makeQualified(targetFS.getUri(),
targetFS.getWorkingDirectory()) + relPath.toString());
EnumSet<DistCpOptions.FileAttribute> fileAttributes
= getFileAttributeSettings(context);
final String description = "Copying " + sourcePath + " to " + target;
context.setStatus(description);
LOG.info(description);
try {
FileStatus sourceCurrStatus;
FileSystem sourceFS;
try {
sourceFS = sourcePath.getFileSystem(conf);
sourceCurrStatus = sourceFS.getFileStatus(sourcePath);
} catch (FileNotFoundException e) {
throw new IOException(new RetriableFileCopyCommand.CopyReadException(e));
}
FileStatus targetStatus = null;
try {
targetStatus = targetFS.getFileStatus(target);
} catch (FileNotFoundException ignore) {
if (LOG.isDebugEnabled())
LOG.debug("Path could not be found: " + target, ignore);
}
if (targetStatus != null && (targetStatus.isDirectory() != sourceCurrStatus.isDirectory())) {
throw new IOException("Can't replace " + target + ". Target is " +
getFileType(targetStatus) + ", Source is " + getFileType(sourceCurrStatus));
}
if (sourceCurrStatus.isDirectory()) {
createTargetDirsWithRetry(description, target, context);
return;
}
if (skipFile(sourceFS, sourceCurrStatus, target)) {
LOG.info("Skipping copy of " + sourceCurrStatus.getPath()
+ " to " + target);
updateSkipCounters(context, sourceCurrStatus);
context.write(null, new Text("SKIP: " + sourceCurrStatus.getPath()));
}
else {
copyFileWithRetry(description, sourceCurrStatus, target, context,
fileAttributes);
}
DistCpUtils.preserve(target.getFileSystem(conf), target,
sourceCurrStatus, fileAttributes);
} catch (IOException exception) {
handleFailures(exception, sourceFileStatus, target, context);
}
}
private String getFileType(FileStatus fileStatus) {
return fileStatus == null ? "N/A" : (fileStatus.isDirectory() ? "dir" : "file");
}
private static EnumSet<DistCpOptions.FileAttribute>
getFileAttributeSettings(Mapper.Context context) {
String attributeString = context.getConfiguration().get(
DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel());
return DistCpUtils.unpackAttributes(attributeString);
}
private void copyFileWithRetry(String description, FileStatus sourceFileStatus,
Path target, Context context,
EnumSet<DistCpOptions.FileAttribute> fileAttributes) throws IOException {
long bytesCopied;
try {
bytesCopied = (Long)new RetriableFileCopyCommand(description)
.execute(sourceFileStatus, target, context, fileAttributes);
} catch (Exception e) {
context.setStatus("Copy Failure: " + sourceFileStatus.getPath());
throw new IOException("File copy failed: " + sourceFileStatus.getPath() +
" --> " + target, e);
}
incrementCounter(context, Counter.BYTESEXPECTED, sourceFileStatus.getLen());
incrementCounter(context, Counter.BYTESCOPIED, bytesCopied);
incrementCounter(context, Counter.COPY, 1);
}
private void createTargetDirsWithRetry(String description,
Path target, Context context) throws IOException {
try {
new RetriableDirectoryCreateCommand(description).execute(target, context);
} catch (Exception e) {
throw new IOException("mkdir failed for " + target, e);
}
incrementCounter(context, Counter.COPY, 1);
}
private static void updateSkipCounters(Context context,
FileStatus sourceFile) {
incrementCounter(context, Counter.SKIP, 1);
incrementCounter(context, Counter.BYTESSKIPPED, sourceFile.getLen());
}
private void handleFailures(IOException exception,
FileStatus sourceFileStatus, Path target,
Context context) throws IOException, InterruptedException {
LOG.error("Failure in copying " + sourceFileStatus.getPath() + " to " +
target, exception);
if (ignoreFailures && exception.getCause() instanceof
RetriableFileCopyCommand.CopyReadException) {
incrementCounter(context, Counter.FAIL, 1);
incrementCounter(context, Counter.BYTESFAILED, sourceFileStatus.getLen());
context.write(null, new Text("FAIL: " + sourceFileStatus.getPath() + " - " +
StringUtils.stringifyException(exception)));
}
else
throw exception;
}
private static void incrementCounter(Context context, Counter counter,
long value) {
context.getCounter(counter).increment(value);
}
private boolean skipFile(FileSystem sourceFS, FileStatus source, Path target)
throws IOException {
return targetFS.exists(target)
&& !overWrite
&& !mustUpdate(sourceFS, source, target);
}
private boolean mustUpdate(FileSystem sourceFS, FileStatus source, Path target)
throws IOException {
final FileStatus targetFileStatus = targetFS.getFileStatus(target);
return syncFolders
&& (
targetFileStatus.getLen() != source.getLen()
|| (!skipCrc &&
!DistCpUtils.checksumsAreEqual(sourceFS,
source.getPath(), targetFS, target))
|| (source.getBlockSize() != targetFileStatus.getBlockSize() &&
preserve.contains(FileAttribute.BLOCKSIZE))
);
}
}
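// Illustrative sketch, not part of the DistCp sources: CopyMapper skips a file only
// when it already exists on the target, -overwrite is off, and -update (sync-folders)
// does not force a re-copy. The standalone class below restates that decision with
// all inputs modelled as plain booleans/longs; every name here is illustrative only.
public class SkipDecisionSketch {
  static boolean mustUpdate(boolean syncFolders, long srcLen, long tgtLen,
                            boolean skipCrc, boolean checksumsEqual,
                            boolean preserveBlockSize, boolean blockSizesDiffer) {
    return syncFolders
        && (srcLen != tgtLen
            || (!skipCrc && !checksumsEqual)
            || (preserveBlockSize && blockSizesDiffer));
  }
  static boolean skipFile(boolean targetExists, boolean overWrite, boolean mustUpdate) {
    return targetExists && !overWrite && !mustUpdate;
  }
  public static void main(String[] args) {
    // Same length, same checksum, -update on, -overwrite off: the copy is skipped.
    boolean update = mustUpdate(true, 100, 100, false, true, false, false);
    System.out.println("skip = " + skipFile(true, false, update)); // prints "skip = true"
  }
}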

View File

@@ -0,0 +1,124 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.tools.DistCpConstants;
import java.io.IOException;
/**
* The CopyOutputFormat is the Hadoop OutputFormat used in DistCp.
* It sets up the Job's Configuration (in the Job-Context) with the settings
* for the work-directory, final commit-directory, etc. It also sets the right
* output-committer.
* @param <K>
* @param <V>
*/
public class CopyOutputFormat<K, V> extends TextOutputFormat<K, V> {
/**
* Setter for the working directory for DistCp (where files will be copied
* before they are moved to the final commit-directory.)
* @param job The Job on whose configuration the working-directory is to be set.
* @param workingDirectory The path to use as the working directory.
*/
public static void setWorkingDirectory(Job job, Path workingDirectory) {
job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH,
workingDirectory.toString());
}
/**
* Setter for the final directory for DistCp (where files copied will be
* moved, atomically.)
* @param job The Job on whose configuration the commit-directory is to be set.
* @param commitDirectory The path to use for final commit.
*/
public static void setCommitDirectory(Job job, Path commitDirectory) {
job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH,
commitDirectory.toString());
}
/**
* Getter for the working directory.
* @param job The Job from whose configuration the working-directory is to
* be retrieved.
* @return The working-directory Path.
*/
public static Path getWorkingDirectory(Job job) {
return getWorkingDirectory(job.getConfiguration());
}
private static Path getWorkingDirectory(Configuration conf) {
String workingDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH);
if (workingDirectory == null || workingDirectory.isEmpty()) {
return null;
} else {
return new Path(workingDirectory);
}
}
/**
* Getter for the final commit-directory.
* @param job The Job from whose configuration the commit-directory is to be
* retrieved.
* @return The commit-directory Path.
*/
public static Path getCommitDirectory(Job job) {
return getCommitDirectory(job.getConfiguration());
}
private static Path getCommitDirectory(Configuration conf) {
String commitDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH);
if (commitDirectory == null || commitDirectory.isEmpty()) {
return null;
} else {
return new Path(commitDirectory);
}
}
/** {@inheritDoc} */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
return new CopyCommitter(getOutputPath(context), context);
}
/** {@inheritDoc} */
@Override
public void checkOutputSpecs(JobContext context) throws IOException {
Configuration conf = context.getConfiguration();
if (getCommitDirectory(conf) == null) {
throw new IllegalStateException("Commit directory not configured");
}
Path workingPath = getWorkingDirectory(conf);
if (workingPath == null) {
throw new IllegalStateException("Working directory not configured");
}
// get delegation token for outDir's file system
TokenCache.obtainTokensForNamenodes(context.getCredentials(),
new Path[] {workingPath}, conf);
}
}
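// Illustrative sketch, not part of the DistCp sources: the two setters above simply
// record the work- and commit-directories under DistCp's configuration labels, and
// checkOutputSpecs() fails fast if either is missing. A minimal sketch of the
// equivalent configuration wiring, using hadoop-common's Configuration directly;
// the sample paths are made up for illustration.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCpConstants;

public class CopyOutputConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration(false);
    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, "/tmp/distcp/.work");
    conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, "/data/final");
    // checkOutputSpecs() would reject a configuration where either label is unset/empty.
    System.out.println("work   = " + conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
    System.out.println("commit = " + conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
  }
}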

View File

@@ -0,0 +1,56 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred;
import org.apache.hadoop.tools.util.RetriableCommand;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.Mapper;
/**
* This class extends RetriableCommand to implement the creation of directories
* with retries on failure.
*/
public class RetriableDirectoryCreateCommand extends RetriableCommand {
/**
* Constructor, taking a description of the action.
* @param description Verbose description of the directory-creation operation.
*/
public RetriableDirectoryCreateCommand(String description) {
super(description);
}
/**
* Implementation of RetriableCommand::doExecute().
* This implements the actual mkdirs() functionality.
* @param arguments Argument-list to the command.
* @return Boolean. True, if the directory could be created successfully.
* @throws Exception IOException, on failure to create the directory.
*/
@Override
protected Object doExecute(Object... arguments) throws Exception {
assert arguments.length == 2 : "Unexpected argument list.";
Path target = (Path)arguments[0];
Mapper.Context context = (Mapper.Context)arguments[1];
FileSystem targetFS = target.getFileSystem(context.getConfiguration());
return targetFS.mkdirs(target);
}
}

View File

@@ -0,0 +1,245 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred;
import org.apache.hadoop.tools.util.RetriableCommand;
import org.apache.hadoop.tools.util.ThrottledInputStream;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.tools.DistCpOptions.*;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.*;
import java.util.EnumSet;
/**
* This class extends RetriableCommand to implement the copy of files,
* with retries on failure.
*/
public class RetriableFileCopyCommand extends RetriableCommand {
private static Log LOG = LogFactory.getLog(RetriableFileCopyCommand.class);
private static int BUFFER_SIZE = 8 * 1024;
/**
* Constructor, taking a description of the action.
* @param description Verbose description of the copy operation.
*/
public RetriableFileCopyCommand(String description) {
super(description);
}
/**
* Implementation of RetriableCommand::doExecute().
* This is the actual copy-implementation.
* @param arguments Argument-list to the command.
* @return Number of bytes copied.
* @throws Exception: CopyReadException, if there are read-failures. All other
* failures are IOExceptions.
*/
@SuppressWarnings("unchecked")
@Override
protected Object doExecute(Object... arguments) throws Exception {
assert arguments.length == 4 : "Unexpected argument list.";
FileStatus source = (FileStatus)arguments[0];
assert !source.isDirectory() : "Unexpected file-status. Expected file.";
Path target = (Path)arguments[1];
Mapper.Context context = (Mapper.Context)arguments[2];
EnumSet<FileAttribute> fileAttributes
= (EnumSet<FileAttribute>)arguments[3];
return doCopy(source, target, context, fileAttributes);
}
private long doCopy(FileStatus sourceFileStatus, Path target,
Mapper.Context context,
EnumSet<FileAttribute> fileAttributes)
throws IOException {
Path tmpTargetPath = getTmpFile(target, context);
final Configuration configuration = context.getConfiguration();
FileSystem targetFS = target.getFileSystem(configuration);
try {
if (LOG.isDebugEnabled()) {
LOG.debug("Copying " + sourceFileStatus.getPath() + " to " + target);
LOG.debug("Tmp-file path: " + tmpTargetPath);
}
FileSystem sourceFS = sourceFileStatus.getPath().getFileSystem(
configuration);
long bytesRead = copyToTmpFile(tmpTargetPath, targetFS, sourceFileStatus,
context, fileAttributes);
compareFileLengths(sourceFileStatus, tmpTargetPath, configuration, bytesRead);
compareCheckSums(sourceFS, sourceFileStatus.getPath(), targetFS, tmpTargetPath);
promoteTmpToTarget(tmpTargetPath, target, targetFS);
return bytesRead;
} finally {
if (targetFS.exists(tmpTargetPath))
targetFS.delete(tmpTargetPath, false);
}
}
private long copyToTmpFile(Path tmpTargetPath, FileSystem targetFS,
FileStatus sourceFileStatus, Mapper.Context context,
EnumSet<FileAttribute> fileAttributes)
throws IOException {
OutputStream outStream = new BufferedOutputStream(targetFS.create(
tmpTargetPath, true, BUFFER_SIZE,
getReplicationFactor(fileAttributes, sourceFileStatus, targetFS),
getBlockSize(fileAttributes, sourceFileStatus, targetFS), context));
return copyBytes(sourceFileStatus, outStream, BUFFER_SIZE, true, context);
}
private void compareFileLengths(FileStatus sourceFileStatus, Path target,
Configuration configuration, long bytesRead)
throws IOException {
final Path sourcePath = sourceFileStatus.getPath();
FileSystem fs = sourcePath.getFileSystem(configuration);
if (fs.getFileStatus(sourcePath).getLen() != bytesRead)
throw new IOException("Mismatch in length of source:" + sourcePath
+ " and target:" + target);
}
private void compareCheckSums(FileSystem sourceFS, Path source,
FileSystem targetFS, Path target)
throws IOException {
if (!DistCpUtils.checksumsAreEqual(sourceFS, source, targetFS, target))
throw new IOException("Check-sum mismatch between "
+ source + " and " + target);
}
//If target file exists and unable to delete target - fail
//If target doesn't exist and unable to create parent folder - fail
//If target is successfully deleted and parent exists, if rename fails - fail
private void promoteTmpToTarget(Path tmpTarget, Path target, FileSystem fs)
throws IOException {
if ((fs.exists(target) && !fs.delete(target, false))
|| (!fs.exists(target.getParent()) && !fs.mkdirs(target.getParent()))
|| !fs.rename(tmpTarget, target)) {
throw new IOException("Failed to promote tmp-file:" + tmpTarget
+ " to: " + target);
}
}
private Path getTmpFile(Path target, Mapper.Context context) {
Path targetWorkPath = new Path(context.getConfiguration().
get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
Path root = target.equals(targetWorkPath)? targetWorkPath.getParent() : targetWorkPath;
LOG.info("Creating temp file: " +
new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString()));
return new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString());
}
private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream,
int bufferSize, boolean mustCloseStream,
Mapper.Context context) throws IOException {
Path source = sourceFileStatus.getPath();
byte buf[] = new byte[bufferSize];
ThrottledInputStream inStream = null;
long totalBytesRead = 0;
try {
inStream = getInputStream(source, context.getConfiguration());
int bytesRead = readBytes(inStream, buf);
while (bytesRead >= 0) {
totalBytesRead += bytesRead;
outStream.write(buf, 0, bytesRead);
updateContextStatus(totalBytesRead, context, sourceFileStatus);
bytesRead = inStream.read(buf);
}
} finally {
if (mustCloseStream)
IOUtils.cleanup(LOG, outStream, inStream);
}
return totalBytesRead;
}
private void updateContextStatus(long totalBytesRead, Mapper.Context context,
FileStatus sourceFileStatus) {
StringBuilder message = new StringBuilder(DistCpUtils.getFormatter()
.format(totalBytesRead * 100.0f / sourceFileStatus.getLen()));
message.append("% ")
.append(description).append(" [")
.append(DistCpUtils.getStringDescriptionFor(totalBytesRead))
.append('/')
.append(DistCpUtils.getStringDescriptionFor(sourceFileStatus.getLen()))
.append(']');
context.setStatus(message.toString());
}
private static int readBytes(InputStream inStream, byte buf[])
throws IOException {
try {
return inStream.read(buf);
}
catch (IOException e) {
throw new CopyReadException(e);
}
}
private static ThrottledInputStream getInputStream(Path path, Configuration conf)
throws IOException {
try {
FileSystem fs = path.getFileSystem(conf);
long bandwidthMB = conf.getInt(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
DistCpConstants.DEFAULT_BANDWIDTH_MB);
return new ThrottledInputStream(new BufferedInputStream(fs.open(path)),
bandwidthMB * 1024 * 1024);
}
catch (IOException e) {
throw new CopyReadException(e);
}
}
private static short getReplicationFactor(
EnumSet<FileAttribute> fileAttributes,
FileStatus sourceFile, FileSystem targetFS) {
return fileAttributes.contains(FileAttribute.REPLICATION)?
sourceFile.getReplication() : targetFS.getDefaultReplication();
}
private static long getBlockSize(
EnumSet<FileAttribute> fileAttributes,
FileStatus sourceFile, FileSystem targetFS) {
return fileAttributes.contains(FileAttribute.BLOCKSIZE)?
sourceFile.getBlockSize() : targetFS.getDefaultBlockSize();
}
/**
* Special subclass of IOException. This is used to distinguish read-operation
* failures from other kinds of IOExceptions.
* The failure to read from source is dealt with specially, in the CopyMapper.
* Such failures may be skipped if the DistCpOptions indicate so.
* Write failures are intolerable, and amount to CopyMapper failure.
*/
public static class CopyReadException extends IOException {
public CopyReadException(Throwable rootCause) {
super(rootCause);
}
}
}
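// Illustrative sketch, not part of the DistCp sources: doCopy() above writes to a
// temporary file, verifies length and checksum, then renames the temp file over the
// target so a partially written file never appears at the final path. The standalone
// class below shows the same write-then-promote pattern on the local filesystem with
// java.io; the directory and file names are made up for illustration.
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public class WriteThenPromoteSketch {
  public static void main(String[] args) throws IOException {
    File dir = new File(System.getProperty("java.io.tmpdir"), "promote-sketch");
    dir.mkdirs();
    File tmp = new File(dir, ".distcp.tmp.attempt_0001");
    File target = new File(dir, "part-0000");
    FileOutputStream out = new FileOutputStream(tmp);
    try {
      out.write("payload".getBytes("UTF-8")); // 1. write everything to the temp file
    } finally {
      out.close();
    }
    // 2. (length/checksum verification of the temp file would happen here)
    // 3. promote: remove any stale target, then rename the temp file into place
    if (target.exists() && !target.delete()) {
      throw new IOException("Could not delete stale target: " + target);
    }
    if (!tmp.renameTo(target)) {
      throw new IOException("Failed to promote " + tmp + " to " + target);
    }
    System.out.println("promoted: " + target + " (" + target.length() + " bytes)");
  }
}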

View File

@@ -0,0 +1,169 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.conf.Configuration;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
/**
* UniformSizeInputFormat extends the InputFormat<> class, to produce
* input-splits for DistCp.
* It looks at the copy-listing and groups the contents into input-splits such
* that the total-number of bytes to be copied for each input split is
* uniform.
*/
public class UniformSizeInputFormat extends InputFormat<Text, FileStatus> {
private static final Log LOG
= LogFactory.getLog(UniformSizeInputFormat.class);
/**
* Implementation of InputFormat::getSplits(). Returns a list of InputSplits,
* such that the number of bytes to be copied for all the splits are
* approximately equal.
* @param context JobContext for the job.
* @return The list of uniformly-distributed input-splits.
* @throws IOException: On failure.
* @throws InterruptedException
*/
@Override
public List<InputSplit> getSplits(JobContext context)
throws IOException, InterruptedException {
Configuration configuration = context.getConfiguration();
int numSplits = DistCpUtils.getInt(configuration,
JobContext.NUM_MAPS);
if (numSplits == 0) return new ArrayList<InputSplit>();
return getSplits(configuration, numSplits,
DistCpUtils.getLong(configuration,
DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED));
}
private List<InputSplit> getSplits(Configuration configuration, int numSplits,
long totalSizeBytes) throws IOException {
List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
FileStatus srcFileStatus = new FileStatus();
Text srcRelPath = new Text();
long currentSplitSize = 0;
long lastSplitStart = 0;
long lastPosition = 0;
final Path listingFilePath = getListingFilePath(configuration);
if (LOG.isDebugEnabled()) {
LOG.debug("Average bytes per map: " + nBytesPerSplit +
", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
}
SequenceFile.Reader reader=null;
try {
reader = getListingFileReader(configuration);
while (reader.next(srcRelPath, srcFileStatus)) {
// If adding the current file would push this split past the per-split byte
// limit, close off the current split and start a new one for this file.
if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
lastPosition - lastSplitStart, null);
if (LOG.isDebugEnabled()) {
LOG.debug ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
}
splits.add(split);
lastSplitStart = lastPosition;
currentSplitSize = 0;
}
currentSplitSize += srcFileStatus.getLen();
lastPosition = reader.getPosition();
}
if (lastPosition > lastSplitStart) {
FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
lastPosition - lastSplitStart, null);
if (LOG.isDebugEnabled()) {
LOG.info ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
}
splits.add(split);
}
} finally {
IOUtils.closeStream(reader);
}
return splits;
}
private static Path getListingFilePath(Configuration configuration) {
final String listingFilePathString =
configuration.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
assert !listingFilePathString.equals("")
: "Couldn't find listing file. Invalid input.";
return new Path(listingFilePathString);
}
private SequenceFile.Reader getListingFileReader(Configuration configuration) {
final Path listingFilePath = getListingFilePath(configuration);
try {
final FileSystem fileSystem = listingFilePath.getFileSystem(configuration);
if (!fileSystem.exists(listingFilePath))
throw new IllegalArgumentException("Listing file doesn't exist at: "
+ listingFilePath);
return new SequenceFile.Reader(configuration,
SequenceFile.Reader.file(listingFilePath));
}
catch (IOException exception) {
LOG.error("Couldn't find listing file at: " + listingFilePath, exception);
throw new IllegalArgumentException("Couldn't find listing-file at: "
+ listingFilePath, exception);
}
}
/**
* Implementation of InputFormat::createRecordReader().
* @param split The split for which the RecordReader is sought.
* @param context The context of the current task-attempt.
* @return A SequenceFileRecordReader instance, (since the copy-listing is a
* simple sequence-file.)
* @throws IOException
* @throws InterruptedException
*/
@Override
public RecordReader<Text, FileStatus> createRecordReader(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
return new SequenceFileRecordReader<Text, FileStatus>();
}
}
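// Illustrative sketch, not part of the DistCp sources: getSplits() above walks the
// listing in order and closes a split whenever the accumulated byte-count would
// exceed ceil(totalBytes / numSplits). The standalone class below reproduces that
// grouping over an array of file lengths; the numbers are made-up examples.
public class UniformSizeSplitSketch {
  public static void main(String[] args) {
    long[] fileLengths = {10, 200, 30, 400, 50, 600, 70};
    int numSplits = 3;
    long total = 0;
    for (long len : fileLengths) total += len;
    long bytesPerSplit = (long) Math.ceil(total * 1.0 / numSplits);
    long current = 0;
    StringBuilder split = new StringBuilder();
    for (long len : fileLengths) {
      // Close the current split before a file that would push it past the limit.
      if (current + len > bytesPerSplit && current > 0) {
        System.out.println("split [" + split + "] = " + current + " bytes");
        split.setLength(0);
        current = 0;
      }
      split.append(len).append(' ');
      current += len;
    }
    if (current > 0) {
      System.out.println("split [" + split + "] = " + current + " bytes");
    }
  }
}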

View File

@@ -0,0 +1,246 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred.lib;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import java.io.IOException;
/**
* The DynamicInputChunk represents a single chunk of work, when used in
* conjunction with the DynamicInputFormat and the DynamicRecordReader.
* The records in the DynamicInputFormat's input-file are split across various
* DynamicInputChunks. Each one is claimed and processed in an iteration of
* a dynamic-mapper. When a DynamicInputChunk has been exhausted, a faster
* mapper may claim another and process it, until there are no more to be
* consumed.
*/
class DynamicInputChunk<K, V> {
private static Log LOG = LogFactory.getLog(DynamicInputChunk.class);
private static Configuration configuration;
private static Path chunkRootPath;
private static String chunkFilePrefix;
private static int numChunksLeft = -1; // Un-initialized before 1st dir-scan.
private static FileSystem fs;
private Path chunkFilePath;
private SequenceFileRecordReader<K, V> reader;
private SequenceFile.Writer writer;
private static void initializeChunkInvariants(Configuration config)
throws IOException {
configuration = config;
Path listingFilePath = new Path(getListingFilePath(configuration));
chunkRootPath = new Path(listingFilePath.getParent(), "chunkDir");
fs = chunkRootPath.getFileSystem(configuration);
chunkFilePrefix = listingFilePath.getName() + ".chunk.";
}
private static String getListingFilePath(Configuration configuration) {
final String listingFileString = configuration.get(
DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
assert !listingFileString.equals("") : "Listing file not found.";
return listingFileString;
}
private static boolean areInvariantsInitialized() {
return chunkRootPath != null;
}
private DynamicInputChunk(String chunkId, Configuration configuration)
throws IOException {
if (!areInvariantsInitialized())
initializeChunkInvariants(configuration);
chunkFilePath = new Path(chunkRootPath, chunkFilePrefix + chunkId);
openForWrite();
}
private void openForWrite() throws IOException {
writer = SequenceFile.createWriter(
chunkFilePath.getFileSystem(configuration), configuration,
chunkFilePath, Text.class, FileStatus.class,
SequenceFile.CompressionType.NONE);
}
/**
* Factory method to create chunk-files for writing to.
* (For instance, when the DynamicInputFormat splits the input-file into
* chunks.)
* @param chunkId String to identify the chunk.
* @param configuration Configuration, describing the location of the listing-
* file, file-system for the map-job, etc.
* @return A DynamicInputChunk, corresponding to a chunk-file, with the name
* incorporating the chunk-id.
* @throws IOException Exception on failure to create the chunk.
*/
public static DynamicInputChunk createChunkForWrite(String chunkId,
Configuration configuration) throws IOException {
return new DynamicInputChunk(chunkId, configuration);
}
/**
* Method to write records into a chunk.
* @param key Key from the listing file.
* @param value Corresponding value from the listing file.
* @throws IOException Exception on failure to write to the file.
*/
public void write(Text key, FileStatus value) throws IOException {
writer.append(key, value);
}
/**
* Closes streams opened to the chunk-file.
*/
public void close() {
IOUtils.cleanup(LOG, reader, writer);
}
/**
* Reassigns the chunk to a specified Map-Task, for consumption.
* @param taskId The Map-Task to which the chunk is to be reassigned.
* @throws IOException Exception on failure to reassign.
*/
public void assignTo(TaskID taskId) throws IOException {
Path newPath = new Path(chunkRootPath, taskId.toString());
if (!fs.rename(chunkFilePath, newPath)) {
LOG.warn(chunkFilePath + " could not be assigned to " + taskId);
}
}
private DynamicInputChunk(Path chunkFilePath,
TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
if (!areInvariantsInitialized())
initializeChunkInvariants(taskAttemptContext.getConfiguration());
this.chunkFilePath = chunkFilePath;
openForRead(taskAttemptContext);
}
private void openForRead(TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
reader = new SequenceFileRecordReader<K, V>();
reader.initialize(new FileSplit(chunkFilePath, 0,
DistCpUtils.getFileSize(chunkFilePath, configuration), null),
taskAttemptContext);
}
/**
* Factory method that
* 1. acquires a chunk for the specified map-task attempt
* 2. returns a DynamicInputChunk associated with the acquired chunk-file.
* @param taskAttemptContext The attempt-context for the map task that's
* trying to acquire a chunk.
* @return The acquired dynamic-chunk. The chunk-file is renamed to the
* attempt-id (from the attempt-context.)
* @throws IOException Exception on failure.
* @throws InterruptedException Exception on failure.
*/
public static DynamicInputChunk acquire(TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
if (!areInvariantsInitialized())
initializeChunkInvariants(taskAttemptContext.getConfiguration());
String taskId
= taskAttemptContext.getTaskAttemptID().getTaskID().toString();
Path acquiredFilePath = new Path(chunkRootPath, taskId);
if (fs.exists(acquiredFilePath)) {
LOG.info("Acquiring pre-assigned chunk: " + acquiredFilePath);
return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
}
for (FileStatus chunkFile : getListOfChunkFiles()) {
if (fs.rename(chunkFile.getPath(), acquiredFilePath)) {
LOG.info(taskId + " acquired " + chunkFile.getPath());
return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
}
else
LOG.warn(taskId + " could not acquire " + chunkFile.getPath());
}
return null;
}
/**
* Method to be called to relinquish an acquired chunk. All streams open to
* the chunk are closed, and the chunk-file is deleted.
* @throws IOException Exception thrown on failure to release (i.e. delete)
* the chunk file.
*/
public void release() throws IOException {
close();
if (!fs.delete(chunkFilePath, false)) {
LOG.error("Unable to release chunk at path: " + chunkFilePath);
throw new IOException("Unable to release chunk at path: " + chunkFilePath);
}
}
static FileStatus [] getListOfChunkFiles() throws IOException {
Path chunkFilePattern = new Path(chunkRootPath, chunkFilePrefix + "*");
FileStatus chunkFiles[] = fs.globStatus(chunkFilePattern);
numChunksLeft = chunkFiles.length;
return chunkFiles;
}
/**
* Getter for the chunk-file's path, on HDFS.
* @return The qualified path to the chunk-file.
*/
public Path getPath() {
return chunkFilePath;
}
/**
* Getter for the record-reader, opened to the chunk-file.
* @return Opened Sequence-file reader.
*/
public SequenceFileRecordReader<K,V> getReader() {
assert reader != null : "Reader un-initialized!";
return reader;
}
/**
* Getter for the number of chunk-files left in the chunk-file directory.
* Useful to determine how many chunks (and hence, records) are left to be
* processed.
* @return Before the first scan of the directory, the number returned is -1.
* Otherwise, the number of chunk-files seen from the last scan is returned.
*/
public static int getNumChunksLeft() {
return numChunksLeft;
}
}
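// Illustrative sketch, not part of the DistCp sources: acquire() above relies on the
// atomicity of FileSystem.rename() so that exactly one map task can claim a given
// chunk file. The standalone class below shows the same claim-by-rename idea on the
// local filesystem with java.io.File; the chunk and task names are made up.
import java.io.File;
import java.io.IOException;

public class ClaimByRenameSketch {
  public static void main(String[] args) throws IOException {
    File chunkDir = new File(System.getProperty("java.io.tmpdir"), "chunk-sketch");
    chunkDir.mkdirs();
    File chunk = new File(chunkDir, "fileList.seq.chunk.00000");
    chunk.createNewFile(); // an unclaimed chunk
    String taskId = "task_0001_m_000003";
    File claimed = new File(chunkDir, taskId);
    // Only one renamer can win; a losing task would simply move on to the next chunk.
    if (chunk.renameTo(claimed)) {
      System.out.println(taskId + " acquired " + claimed.getName());
    } else {
      System.out.println(taskId + " lost the race; trying the next chunk");
    }
  }
}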

View File

@@ -0,0 +1,292 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.mapred.lib;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
/**
* DynamicInputFormat implements the "Worker pattern" for DistCp.
* Rather than splitting up the copy-list into a set of static splits,
* the DynamicInputFormat does the following:
* 1. Splits the copy-list into small chunks on the DFS.
* 2. Creates a set of empty "dynamic" splits, each of which consumes as many
* chunks as it can.
* This arrangement ensures that a single slow mapper won't slow down the entire
* job (since the slack will be picked up by other mappers, who consume more
* chunks.)
* By varying the split-ratio, one can vary chunk sizes to achieve different
* performance characteristics.
*/
public class DynamicInputFormat<K, V> extends InputFormat<K, V> {
private static final Log LOG = LogFactory.getLog(DynamicInputFormat.class);
private static final String CONF_LABEL_LISTING_SPLIT_RATIO
= "mapred.listing.split.ratio";
private static final String CONF_LABEL_NUM_SPLITS
= "mapred.num.splits";
private static final String CONF_LABEL_NUM_ENTRIES_PER_CHUNK
= "mapred.num.entries.per.chunk";
/**
* Implementation of InputFormat::getSplits(). This method splits up the
* copy-listing file into chunks, and assigns the first batch to different
* tasks.
* @param jobContext JobContext for the map job.
* @return The list of (empty) dynamic input-splits.
* @throws IOException, on failure.
* @throws InterruptedException
*/
@Override
public List<InputSplit> getSplits(JobContext jobContext)
throws IOException, InterruptedException {
LOG.info("DynamicInputFormat: Getting splits for job:"
+ jobContext.getJobID());
return createSplits(jobContext,
splitCopyListingIntoChunksWithShuffle(jobContext));
}
private List<InputSplit> createSplits(JobContext jobContext,
List<DynamicInputChunk> chunks)
throws IOException {
int numMaps = getNumMapTasks(jobContext.getConfiguration());
final int nSplits = Math.min(numMaps, chunks.size());
List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);
for (int i=0; i< nSplits; ++i) {
TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
chunks.get(i).assignTo(taskId);
splits.add(new FileSplit(chunks.get(i).getPath(), 0,
// Setting non-zero length for FileSplit size, to avoid a possible
// future when 0-sized file-splits are considered "empty" and skipped
// over.
MIN_RECORDS_PER_CHUNK,
null));
}
DistCpUtils.publish(jobContext.getConfiguration(),
CONF_LABEL_NUM_SPLITS, splits.size());
return splits;
}
private static int N_CHUNKS_OPEN_AT_ONCE_DEFAULT = 16;
private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle
(JobContext context) throws IOException {
final Configuration configuration = context.getConfiguration();
int numRecords = getNumberOfRecords(configuration);
int numMaps = getNumMapTasks(configuration);
// Number of chunks each map will process, on average.
int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
validateNumChunksUsing(splitRatio, numMaps);
int numEntriesPerChunk = (int)Math.ceil((float)numRecords
/(splitRatio * numMaps));
DistCpUtils.publish(context.getConfiguration(),
CONF_LABEL_NUM_ENTRIES_PER_CHUNK,
numEntriesPerChunk);
final int nChunksTotal = (int)Math.ceil((float)numRecords/numEntriesPerChunk);
int nChunksOpenAtOnce
= Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);
Path listingPath = getListingFilePath(configuration);
SequenceFile.Reader reader
= new SequenceFile.Reader(configuration,
SequenceFile.Reader.file(listingPath));
List<DynamicInputChunk> openChunks
= new ArrayList<DynamicInputChunk>();
List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
FileStatus fileStatus = new FileStatus();
Text relPath = new Text();
int recordCounter = 0;
int chunkCount = 0;
try {
while (reader.next(relPath, fileStatus)) {
if (recordCounter % (nChunksOpenAtOnce*numEntriesPerChunk) == 0) {
// All chunks full. Create new chunk-set.
closeAll(openChunks);
chunksFinal.addAll(openChunks);
openChunks = createChunks(
configuration, chunkCount, nChunksTotal, nChunksOpenAtOnce);
chunkCount += openChunks.size();
nChunksOpenAtOnce = openChunks.size();
recordCounter = 0;
}
// Shuffle into open chunks.
openChunks.get(recordCounter%nChunksOpenAtOnce).write(relPath, fileStatus);
++recordCounter;
}
} finally {
closeAll(openChunks);
chunksFinal.addAll(openChunks);
IOUtils.closeStream(reader);
}
LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
return chunksFinal;
}
private static void validateNumChunksUsing(int splitRatio, int numMaps)
throws IOException {
if (splitRatio * numMaps > MAX_CHUNKS_TOLERABLE)
throw new IOException("Too many chunks created with splitRatio:"
+ splitRatio + ", numMaps:" + numMaps
+ ". Reduce numMaps or decrease split-ratio to proceed.");
}
private static void closeAll(List<DynamicInputChunk> chunks) {
for (DynamicInputChunk chunk: chunks)
chunk.close();
}
private static List<DynamicInputChunk> createChunks(Configuration config,
int chunkCount, int nChunksTotal, int nChunksOpenAtOnce)
throws IOException {
List<DynamicInputChunk> chunks = new ArrayList<DynamicInputChunk>();
int chunkIdUpperBound
= Math.min(nChunksTotal, chunkCount + nChunksOpenAtOnce);
// If there will be fewer than nChunksOpenAtOnce chunks left after
// the current batch of chunks, fold the remaining chunks into
// the current batch.
if (nChunksTotal - chunkIdUpperBound < nChunksOpenAtOnce)
chunkIdUpperBound = nChunksTotal;
for (int i=chunkCount; i < chunkIdUpperBound; ++i)
chunks.add(createChunk(i, config));
return chunks;
}
private static DynamicInputChunk createChunk(int chunkId, Configuration config)
throws IOException {
return DynamicInputChunk.createChunkForWrite(String.format("%05d", chunkId),
config);
}
private static Path getListingFilePath(Configuration configuration) {
String listingFilePathString = configuration.get(
DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
assert !listingFilePathString.equals("") : "Listing file not found.";
Path listingFilePath = new Path(listingFilePathString);
try {
assert listingFilePath.getFileSystem(configuration)
.exists(listingFilePath) : "Listing file: " + listingFilePath +
" not found.";
} catch (IOException e) {
assert false : "Listing file: " + listingFilePath
+ " couldn't be accessed. " + e.getMessage();
}
return listingFilePath;
}
private static int getNumberOfRecords(Configuration configuration) {
return DistCpUtils.getInt(configuration,
DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
}
private static int getNumMapTasks(Configuration configuration) {
return DistCpUtils.getInt(configuration,
JobContext.NUM_MAPS);
}
private static int getListingSplitRatio(Configuration configuration,
int numMaps, int numPaths) {
return configuration.getInt(
CONF_LABEL_LISTING_SPLIT_RATIO,
getSplitRatio(numMaps, numPaths));
}
private static final int MAX_CHUNKS_TOLERABLE = 400;
private static final int MAX_CHUNKS_IDEAL = 100;
private static final int MIN_RECORDS_PER_CHUNK = 5;
private static final int SPLIT_RATIO_DEFAULT = 2;
/**
* Package private, for testability.
* @param nMaps The number of maps requested.
* @param nRecords The number of records to be copied.
* @return The number of splits each map should handle, ideally.
*/
static int getSplitRatio(int nMaps, int nRecords) {
if (nMaps == 1) {
LOG.warn("nMaps == 1. Why use DynamicInputFormat?");
return 1;
}
if (nMaps > MAX_CHUNKS_IDEAL)
return SPLIT_RATIO_DEFAULT;
int nPickups = (int)Math.ceil((float)MAX_CHUNKS_IDEAL/nMaps);
int nRecordsPerChunk = (int)Math.ceil((float)nRecords/(nMaps*nPickups));
return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ?
SPLIT_RATIO_DEFAULT : nPickups;
}
static int getNumEntriesPerChunk(Configuration configuration) {
return DistCpUtils.getInt(configuration,
CONF_LABEL_NUM_ENTRIES_PER_CHUNK);
}
/**
* Implementation of Inputformat::createRecordReader().
* @param inputSplit The split for which the RecordReader is required.
* @param taskAttemptContext TaskAttemptContext for the current attempt.
* @return DynamicRecordReader instance.
* @throws IOException, on failure.
* @throws InterruptedException
*/
@Override
public RecordReader<K, V> createRecordReader(
InputSplit inputSplit,
TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
return new DynamicRecordReader<K, V>();
}
}
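// Illustrative sketch, not part of the DistCp sources: getSplitRatio() above picks
// how many chunks each map should claim on average, aiming for roughly
// MAX_CHUNKS_IDEAL (100) chunks in total while keeping at least
// MIN_RECORDS_PER_CHUNK (5) records per chunk. The standalone class below restates
// that heuristic with a worked example; the constants mirror those in the class above.
public class SplitRatioSketch {
  static final int MAX_CHUNKS_IDEAL = 100;
  static final int MIN_RECORDS_PER_CHUNK = 5;
  static final int SPLIT_RATIO_DEFAULT = 2;

  static int getSplitRatio(int nMaps, int nRecords) {
    if (nMaps == 1) return 1; // dynamic chunking buys nothing with a single map
    if (nMaps > MAX_CHUNKS_IDEAL) return SPLIT_RATIO_DEFAULT;
    int nPickups = (int) Math.ceil((float) MAX_CHUNKS_IDEAL / nMaps);
    int nRecordsPerChunk = (int) Math.ceil((float) nRecords / (nMaps * nPickups));
    return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ? SPLIT_RATIO_DEFAULT : nPickups;
  }

  public static void main(String[] args) {
    // 20 maps, 10,000 records: nPickups = ceil(100/20) = 5, records per chunk =
    // ceil(10000/100) = 100 >= 5, so each map aims to pick up about 5 chunks.
    System.out.println(getSplitRatio(20, 10000)); // prints 5
  }
}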

Some files were not shown because too many files have changed in this diff