Merge r1234388 through r1236385 from 0.23.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23-PB@1236395 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
commit
359c746ca7
|
@ -127,6 +127,17 @@
|
|||
<unpack>false</unpack>
|
||||
</binaries>
|
||||
</moduleSet>
|
||||
<moduleSet>
|
||||
<includes>
|
||||
<include>org.apache.hadoop:hadoop-mapreduce-client-jobclient</include>
|
||||
</includes>
|
||||
<binaries>
|
||||
<attachmentClassifier>tests</attachmentClassifier>
|
||||
<outputDirectory>share/hadoop/${hadoop.component}</outputDirectory>
|
||||
<includeDependencies>false</includeDependencies>
|
||||
<unpack>false</unpack>
|
||||
</binaries>
|
||||
</moduleSet>
|
||||
</moduleSets>
|
||||
<dependencySets>
|
||||
<dependencySet>
|
||||
|
|
|
@ -125,6 +125,21 @@ Release 0.23.1 - Unreleased
|
|||
|
||||
HADOOP-7975. Add LZ4 as an entry in the default codec list, missed by HADOOP-7657 (harsh)
|
||||
|
||||
HADOOP-7987. Support setting the run-as user in unsecure mode. (jitendra)
|
||||
|
||||
HADOOP-4515. Configuration#getBoolean must not be case sensitive. (Sho Shimauchi via harsh)
|
||||
|
||||
HADOOP-6490. Use StringUtils over String#replace in Path#normalizePath.
|
||||
(Uma Maheswara Rao G via harsh)
|
||||
|
||||
HADOOP-7574. Improve FSShell -stat, add user/group elements.
|
||||
(XieXianshan via harsh)
|
||||
|
||||
HADOOP-7736. Remove duplicate Path#normalizePath call. (harsh)
|
||||
|
||||
HADOOP-7919. Remove the unused hadoop.logfile.* properties from the
|
||||
core-default.xml file. (harsh)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
BUG FIXES
|
||||
|
@ -207,6 +222,9 @@ Release 0.23.1 - Unreleased
|
|||
HADOOP-7986. Adding config for MapReduce History Server protocol in
|
||||
hadoop-policy.xml for service level authorization. (Mahadev Konar via vinodkv)
|
||||
|
||||
HADOOP-7981. Improve documentation for org.apache.hadoop.io.compress.
|
||||
Decompressor.getRemaining (Jonathan Eagles via mahadev)
|
||||
|
||||
Release 0.23.0 - 2011-11-01
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -753,11 +753,6 @@
|
|||
|
||||
<section>
|
||||
<title> secondarynamenode </title>
|
||||
<note>
|
||||
The Secondary NameNode has been deprecated. Instead, consider using the
|
||||
<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Checkpoint+Node">Checkpoint Node</a> or
|
||||
<a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Backup+Node">Backup Node</a>.
|
||||
</note>
|
||||
<p>
|
||||
Runs the HDFS secondary
|
||||
namenode. See <a href="http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html#Secondary+NameNode">Secondary NameNode</a>
|
||||
|
|
|
@ -826,6 +826,12 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
|
|||
*/
|
||||
public boolean getBoolean(String name, boolean defaultValue) {
|
||||
String valueString = getTrimmed(name);
|
||||
if (null == valueString || "".equals(valueString)) {
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
valueString = valueString.toLowerCase();
|
||||
|
||||
if ("true".equals(valueString))
|
||||
return true;
|
||||
else if ("false".equals(valueString))
|
||||
|
|
|
@ -18,10 +18,12 @@
|
|||
|
||||
package org.apache.hadoop.fs;
|
||||
|
||||
import java.net.*;
|
||||
import java.io.*;
|
||||
import org.apache.avro.reflect.Stringable;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import org.apache.avro.reflect.Stringable;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -76,7 +78,7 @@ public class Path implements Comparable {
|
|||
}
|
||||
URI resolved = parentUri.resolve(child.uri);
|
||||
initialize(resolved.getScheme(), resolved.getAuthority(),
|
||||
normalizePath(resolved.getPath()), resolved.getFragment());
|
||||
resolved.getPath(), resolved.getFragment());
|
||||
}
|
||||
|
||||
private void checkPathArg( String path ) {
|
||||
|
@ -158,8 +160,8 @@ public class Path implements Comparable {
|
|||
|
||||
private String normalizePath(String path) {
|
||||
// remove double slashes & backslashes
|
||||
path = path.replace("//", "/");
|
||||
path = path.replace("\\", "/");
|
||||
path = StringUtils.replace(path, "//", "/");
|
||||
path = StringUtils.replace(path, "\\", "/");
|
||||
|
||||
// trim trailing slash from non-root path (ignoring windows drive)
|
||||
int minLength = hasWindowsDrive(path, true) ? 4 : 1;
|
||||
|
|
|
@ -32,9 +32,11 @@ import org.apache.hadoop.fs.FileStatus;
|
|||
* Print statistics about path in specified format.
|
||||
* Format sequences:
|
||||
* %b: Size of file in blocks
|
||||
* %g: Group name of owner
|
||||
* %n: Filename
|
||||
* %o: Block size
|
||||
* %r: replication
|
||||
* %u: User name of owner
|
||||
* %y: UTC date as "yyyy-MM-dd HH:mm:ss"
|
||||
* %Y: Milliseconds since January 1, 1970 UTC
|
||||
*/
|
||||
|
@ -50,8 +52,8 @@ class Stat extends FsCommand {
|
|||
public static final String USAGE = "[format] <path> ...";
|
||||
public static final String DESCRIPTION =
|
||||
"Print statistics about the file/directory at <path>\n" +
|
||||
"in the specified format. Format accepts filesize in blocks (%b), filename (%n),\n" +
|
||||
"block size (%o), replication (%r), modification date (%y, %Y)\n";
|
||||
"in the specified format. Format accepts filesize in blocks (%b), group name of owner(%g),\n" +
|
||||
"filename (%n), block size (%o), replication (%r), user name of owner(%u), modification date (%y, %Y)\n";
|
||||
|
||||
protected static final SimpleDateFormat timeFmt;
|
||||
static {
|
||||
|
@ -92,6 +94,9 @@ class Stat extends FsCommand {
|
|||
? "directory"
|
||||
: (stat.isFile() ? "regular file" : "symlink"));
|
||||
break;
|
||||
case 'g':
|
||||
buf.append(stat.getGroup());
|
||||
break;
|
||||
case 'n':
|
||||
buf.append(item.path.getName());
|
||||
break;
|
||||
|
@ -101,6 +106,9 @@ class Stat extends FsCommand {
|
|||
case 'r':
|
||||
buf.append(stat.getReplication());
|
||||
break;
|
||||
case 'u':
|
||||
buf.append(stat.getOwner());
|
||||
break;
|
||||
case 'y':
|
||||
buf.append(timeFmt.format(new Date(stat.getModificationTime())));
|
||||
break;
|
||||
|
|
|
@ -49,7 +49,7 @@ public interface Decompressor {
|
|||
public void setInput(byte[] b, int off, int len);
|
||||
|
||||
/**
|
||||
* Returns true if the input data buffer is empty and
|
||||
* Returns <code>true</code> if the input data buffer is empty and
|
||||
* {@link #setInput(byte[], int, int)} should be called to
|
||||
* provide more input.
|
||||
*
|
||||
|
@ -76,8 +76,11 @@ public interface Decompressor {
|
|||
public boolean needsDictionary();
|
||||
|
||||
/**
|
||||
* Returns true if the end of the decompressed
|
||||
* data output stream has been reached.
|
||||
* Returns <code>true</code> if the end of the decompressed
|
||||
* data output stream has been reached. Indicates a concatenated data stream
|
||||
* when finished() returns <code>true</code> and {@link #getRemaining()}
|
||||
* returns a positive value. finished() will be reset with the
|
||||
* {@link #reset()} method.
|
||||
* @return <code>true</code> if the end of the decompressed
|
||||
* data output stream has been reached.
|
||||
*/
|
||||
|
@ -98,15 +101,23 @@ public interface Decompressor {
|
|||
public int decompress(byte[] b, int off, int len) throws IOException;
|
||||
|
||||
/**
|
||||
* Returns the number of bytes remaining in the compressed-data buffer;
|
||||
* typically called after the decompressor has finished decompressing
|
||||
* the current gzip stream (a.k.a. "member").
|
||||
* Returns the number of bytes remaining in the compressed data buffer.
|
||||
* Indicates a concatenated data stream if {@link #finished()} returns
|
||||
* <code>true</code> and getRemaining() returns a positive value. If
|
||||
* {@link #finished()} returns <code>true</code> and getRemaining() returns
|
||||
* a zero value, indicates that the end of data stream has been reached and
|
||||
* is not a concatenated data stream.
|
||||
* @return The number of bytes remaining in the compressed data buffer.
|
||||
*/
|
||||
public int getRemaining();
|
||||
|
||||
/**
|
||||
* Resets decompressor and input and output buffers so that a new set of
|
||||
* input data can be processed.
|
||||
* input data can be processed. If {@link #finished()} returns
|
||||
* <code>true</code> and {@link #getRemaining()} returns a positive value,
|
||||
* reset() is called before processing of the next data stream in the
|
||||
* concatenated data stream. {@link #finished()} will be reset and will
|
||||
* return <code>false</code> when reset() is called.
|
||||
*/
|
||||
public void reset();
|
||||
|
||||
|
|
|
@ -80,6 +80,7 @@ public class UserGroupInformation {
|
|||
* Percentage of the ticket window to use before we renew ticket.
|
||||
*/
|
||||
private static final float TICKET_RENEW_WINDOW = 0.80f;
|
||||
static final String HADOOP_USER_NAME = "HADOOP_USER_NAME";
|
||||
|
||||
/**
|
||||
* UgiMetrics maintains UGI activity statistics
|
||||
|
@ -137,7 +138,16 @@ public class UserGroupInformation {
|
|||
LOG.debug("using kerberos user:"+user);
|
||||
}
|
||||
}
|
||||
// if we don't have a kerberos user, use the OS user
|
||||
//If we don't have a kerberos user and security is disabled, check
|
||||
//if user is specified in the environment or properties
|
||||
if (!isSecurityEnabled() && (user == null)) {
|
||||
String envUser = System.getenv(HADOOP_USER_NAME);
|
||||
if (envUser == null) {
|
||||
envUser = System.getProperty(HADOOP_USER_NAME);
|
||||
}
|
||||
user = envUser == null ? null : new User(envUser);
|
||||
}
|
||||
// use the OS user
|
||||
if (user == null) {
|
||||
user = getCanonicalUser(OS_PRINCIPAL_CLASS);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
|
|
|
@ -134,20 +134,6 @@
|
|||
</description>
|
||||
</property>
|
||||
|
||||
<!--- logging properties -->
|
||||
|
||||
<property>
|
||||
<name>hadoop.logfile.size</name>
|
||||
<value>10000000</value>
|
||||
<description>The max size of each log file</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>hadoop.logfile.count</name>
|
||||
<value>10</value>
|
||||
<description>The max number of log files</description>
|
||||
</property>
|
||||
|
||||
<!-- i/o properties -->
|
||||
<property>
|
||||
<name>io.file.buffer.size</name>
|
||||
|
|
|
@ -451,6 +451,9 @@ public class TestConfiguration extends TestCase {
|
|||
appendProperty("test.bool3", " true ");
|
||||
appendProperty("test.bool4", " false ");
|
||||
appendProperty("test.bool5", "foo");
|
||||
appendProperty("test.bool6", "TRUE");
|
||||
appendProperty("test.bool7", "FALSE");
|
||||
appendProperty("test.bool8", "");
|
||||
endConfig();
|
||||
Path fileResource = new Path(CONFIG);
|
||||
conf.addResource(fileResource);
|
||||
|
@ -459,6 +462,9 @@ public class TestConfiguration extends TestCase {
|
|||
assertEquals(true, conf.getBoolean("test.bool3", false));
|
||||
assertEquals(false, conf.getBoolean("test.bool4", true));
|
||||
assertEquals(true, conf.getBoolean("test.bool5", true));
|
||||
assertEquals(true, conf.getBoolean("test.bool6", false));
|
||||
assertEquals(false, conf.getBoolean("test.bool7", true));
|
||||
assertEquals(false, conf.getBoolean("test.bool8", false));
|
||||
}
|
||||
|
||||
public void testFloatValues() throws IOException {
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with this
|
||||
* work for additional information regarding copyright ownership. The ASF
|
||||
* licenses this file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations under
|
||||
* the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.security;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
 * Checks that the login user reported by {@link UserGroupInformation} can be
 * supplied through the setting named by
 * {@code UserGroupInformation.HADOOP_USER_NAME}.
 */
public class TestUserFromEnv {
|
||||
|
||||
/**
 * Sets the user name as a JVM system property and expects
 * {@code getLoginUser().getUserName()} to return that same name.
 *
 * @throws IOException declared by the test; presumably raised by
 *         getLoginUser() -- TODO confirm
 */
@Test
|
||||
public void testUserFromEnvironment() throws IOException {
|
||||
// Uses a system property rather than a real environment variable.
// NOTE(review): the property is never cleared in this test, so it stays set
// for anything else that runs later in the same JVM.
System.setProperty(UserGroupInformation.HADOOP_USER_NAME, "randomUser");
|
||||
Assert.assertEquals("randomUser", UserGroupInformation.getLoginUser()
|
||||
.getUserName());
|
||||
}
|
||||
}
|
|
@ -610,11 +610,11 @@
|
|||
</comparator>
|
||||
<comparator>
|
||||
<type>RegexpComparator</type>
|
||||
<expected-output>^( |\t)*in the specified format. Format accepts filesize in blocks \(%b\), filename \(%n\),( )*</expected-output>
|
||||
<expected-output>^( |\t)*in the specified format. Format accepts filesize in blocks \(%b\), group name of owner\(%g\),( )*</expected-output>
|
||||
</comparator>
|
||||
<comparator>
|
||||
<type>RegexpComparator</type>
|
||||
<expected-output>^( |\t)*block size \(%o\), replication \(%r\), modification date \(%y, %Y\)( )*</expected-output>
|
||||
<expected-output>^( |\t)*filename \(%n\), block size \(%o\), replication \(%r\), user name of owner\(%u\), modification date \(%y, %Y\)( )*</expected-output>
|
||||
</comparator>
|
||||
</comparators>
|
||||
</test>
|
||||
|
|
|
@ -18,4 +18,4 @@
|
|||
|
||||
OK_RELEASEAUDIT_WARNINGS=0
|
||||
OK_FINDBUGS_WARNINGS=0
|
||||
OK_JAVADOC_WARNINGS=2
|
||||
OK_JAVADOC_WARNINGS=0
|
||||
|
|
|
@ -53,6 +53,11 @@
|
|||
<artifactId>mockito-all</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-annotations</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.sun.jersey</groupId>
|
||||
<artifactId>jersey-server</artifactId>
|
||||
|
|
|
@ -219,7 +219,7 @@ public class HttpFSServer {
|
|||
* operation is {@link org.apache.hadoop.fs.http.client.HttpFSFileSystem.GetOpValues#LISTSTATUS}
|
||||
* @param doAs user being impersonated, default value is none. It can be used
|
||||
* only if the current user is a HttpFSServer proxyuser.
|
||||
* @param override, default is true. Used only for
|
||||
* @param override default is true. Used only for
|
||||
* {@link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
|
||||
* @param blockSize block size to set, used only by
|
||||
* {@link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
|
||||
|
@ -419,7 +419,7 @@ public class HttpFSServer {
|
|||
* {@link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
|
||||
* @param group group to set, used only for
|
||||
* {@link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#SETOWNER} operations.
|
||||
* @param override, default is true. Used only for
|
||||
* @param override default is true. Used only for
|
||||
* {@link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
|
||||
* @param blockSize block size to set, used only by
|
||||
* {@link org.apache.hadoop.fs.http.client.HttpFSFileSystem.PutOpValues#CREATE} operations.
|
||||
|
|
|
@ -201,6 +201,10 @@ Release 0.23.1 - UNRELEASED
|
|||
|
||||
HDFS-2817. Combine the two TestSafeMode test suites. (todd)
|
||||
|
||||
HDFS-2818. Fix a missing space issue in HDFS webapps' title tags. (Devaraj K via harsh)
|
||||
|
||||
HDFS-2397. Undeprecate SecondaryNameNode (eli)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
HDFS-2130. Switch default checksum to CRC32C. (todd)
|
||||
|
@ -215,6 +219,12 @@ Release 0.23.1 - UNRELEASED
|
|||
for a client on the same node as the block file. (Andrew Purtell,
|
||||
Suresh Srinivas and Jitendra Nath Pandey via szetszwo)
|
||||
|
||||
HDFS-2825. Add test hook to turn off the writer preferring its local
|
||||
DN. (todd)
|
||||
|
||||
HDFS-2826. Add test case for HDFS-1476 (safemode can initialize
|
||||
replication queues before exiting) (todd)
|
||||
|
||||
BUG FIXES
|
||||
|
||||
HDFS-2541. For a sufficiently large value of blocks, the DN Scanner
|
||||
|
@ -276,6 +286,15 @@ Release 0.23.1 - UNRELEASED
|
|||
HDFS-2816. Fix missing license header in httpfs findbugsExcludeFile.xml.
|
||||
(hitesh via tucu)
|
||||
|
||||
HDFS-2822. processMisReplicatedBlock incorrectly identifies
|
||||
under-construction blocks as under-replicated. (todd)
|
||||
|
||||
HDFS-442. dfsthroughput in test jar throws NPE (harsh)
|
||||
|
||||
HDFS-2836. HttpFSServer still has 2 javadoc warnings in trunk (revans2 via tucu)
|
||||
|
||||
HDFS-2837. mvn javadoc:javadoc not seeing LimitedPrivate class (revans2 via tucu)
|
||||
|
||||
Release 0.23.0 - 2011-11-01
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -112,17 +112,18 @@
|
|||
problems.
|
||||
</li>
|
||||
<li>
|
||||
Secondary NameNode (deprecated): performs periodic checkpoints of the
|
||||
Secondary NameNode: performs periodic checkpoints of the
|
||||
namespace and helps keep the size of file containing log of HDFS
|
||||
modifications within certain limits at the NameNode.
|
||||
Replaced by Checkpoint node.
|
||||
</li>
|
||||
|
||||
<li>
|
||||
Checkpoint node: performs periodic checkpoints of the namespace and
|
||||
helps minimize the size of the log stored at the NameNode
|
||||
containing changes to the HDFS.
|
||||
Replaces the role previously filled by the Secondary NameNode.
|
||||
NameNode allows multiple Checkpoint nodes simultaneously,
|
||||
Replaces the role previously filled by the Secondary NameNode,
|
||||
though it is not yet battle hardened.
|
||||
The NameNode allows multiple Checkpoint nodes simultaneously,
|
||||
as long as there are no Backup nodes registered with the system.
|
||||
</li>
|
||||
<li>
|
||||
|
@ -132,6 +133,7 @@
|
|||
which is always in sync with the active NameNode namespace state.
|
||||
Only one Backup node may be registered with the NameNode at once.
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
@ -234,12 +236,6 @@
|
|||
|
||||
</section>
|
||||
<section> <title>Secondary NameNode</title>
|
||||
<note>
|
||||
The Secondary NameNode has been deprecated.
|
||||
Instead, consider using the
|
||||
<a href="hdfs_user_guide.html#Checkpoint+node">Checkpoint Node</a> or
|
||||
<a href="hdfs_user_guide.html#Backup+node">Backup Node</a>.
|
||||
</note>
|
||||
<p>
|
||||
The NameNode stores modifications to the file system as a log
|
||||
appended to a native file system file, <code>edits</code>.
|
||||
|
@ -287,7 +283,9 @@
|
|||
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#secondarynamenode">secondarynamenode</a>.
|
||||
</p>
|
||||
|
||||
</section><section> <title> Checkpoint Node </title>
|
||||
</section>
|
||||
|
||||
<section> <title> Checkpoint Node </title>
|
||||
<p>NameNode persists its namespace using two files: <code>fsimage</code>,
|
||||
which is the latest checkpoint of the namespace and <code>edits</code>,
|
||||
a journal (log) of changes to the namespace since the checkpoint.
|
||||
|
|
|
@ -1793,7 +1793,8 @@ public class BlockManager {
|
|||
public void processMisReplicatedBlocks() {
|
||||
assert namesystem.hasWriteLock();
|
||||
|
||||
long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0;
|
||||
long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0,
|
||||
nrUnderConstruction = 0;
|
||||
neededReplications.clear();
|
||||
for (BlockInfo block : blocksMap.getBlocks()) {
|
||||
INodeFile fileINode = block.getINode();
|
||||
|
@ -1803,6 +1804,12 @@ public class BlockManager {
|
|||
addToInvalidates(block);
|
||||
continue;
|
||||
}
|
||||
if (!block.isComplete()) {
|
||||
// Incomplete blocks are never considered mis-replicated --
|
||||
// they'll be reached when they are completed or recovered.
|
||||
nrUnderConstruction++;
|
||||
continue;
|
||||
}
|
||||
// calculate current replication
|
||||
short expectedReplication = fileINode.getReplication();
|
||||
NumberReplicas num = countNodes(block);
|
||||
|
@ -1826,6 +1833,7 @@ public class BlockManager {
|
|||
LOG.info("Number of invalid blocks = " + nrInvalid);
|
||||
LOG.info("Number of under-replicated blocks = " + nrUnderReplicated);
|
||||
LOG.info("Number of over-replicated blocks = " + nrOverReplicated);
|
||||
LOG.info("Number of blocks being written = " + nrUnderConstruction);
|
||||
}
|
||||
|
||||
/** Set replication for the blocks. */
|
||||
|
|
|
@ -38,6 +38,8 @@ import org.apache.hadoop.net.NetworkTopology;
|
|||
import org.apache.hadoop.net.Node;
|
||||
import org.apache.hadoop.net.NodeBase;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
/** The class is responsible for choosing the desired number of targets
|
||||
* for placing block replicas.
|
||||
* The replica placement strategy is that if the writer is on a datanode,
|
||||
|
@ -49,6 +51,7 @@ import org.apache.hadoop.net.NodeBase;
|
|||
@InterfaceAudience.Private
|
||||
public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||
private boolean considerLoad;
|
||||
private boolean preferLocalNode = true;
|
||||
private NetworkTopology clusterMap;
|
||||
private FSClusterStats stats;
|
||||
static final String enableDebugLogging = "For more information, please enable"
|
||||
|
@ -223,7 +226,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
|||
if (localMachine == null)
|
||||
return chooseRandom(NodeBase.ROOT, excludedNodes,
|
||||
blocksize, maxNodesPerRack, results);
|
||||
|
||||
if (preferLocalNode) {
|
||||
// otherwise try local machine first
|
||||
Node oldNode = excludedNodes.put(localMachine, localMachine);
|
||||
if (oldNode == null) { // was not in the excluded list
|
||||
|
@ -233,7 +236,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
|||
return localMachine;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// try a node on local rack
|
||||
return chooseLocalRack(localMachine, excludedNodes,
|
||||
blocksize, maxNodesPerRack, results);
|
||||
|
@ -568,5 +571,10 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
|||
}
|
||||
return cur;
|
||||
}
|
||||
|
||||
/**
 * Test hook that overwrites the {@code preferLocalNode} flag.
 * NOTE(review): from the visible hunks the flag guards the "try local machine
 * first" step, so passing false appears to skip straight to the local-rack
 * choice -- confirm against the full chooseLocalNode body.
 *
 * @param prefer new value for the flag
 */
@VisibleForTesting
|
||||
void setPreferLocalNode(boolean prefer) {
|
||||
this.preferLocalNode = prefer;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -172,6 +172,7 @@ import org.apache.hadoop.util.VersionInfo;
|
|||
import org.mortbay.util.ajax.JSON;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
/***************************************************
|
||||
* FSNamesystem does the actual bookkeeping work for the
|
||||
|
@ -2842,7 +2843,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
|||
/** Total number of blocks. */
|
||||
int blockTotal;
|
||||
/** Number of safe blocks. */
|
||||
private int blockSafe;
|
||||
int blockSafe;
|
||||
/** Number of blocks needed to satisfy safe mode threshold condition */
|
||||
private int blockThreshold;
|
||||
/** Number of blocks needed before populating replication queues */
|
||||
|
@ -2850,7 +2851,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
|||
/** time of the last status printout */
|
||||
private long lastStatusReport = 0;
|
||||
/** flag indicating whether replication queues have been initialized */
|
||||
private boolean initializedReplQueues = false;
|
||||
boolean initializedReplQueues = false;
|
||||
/** Was safemode entered automatically because available resources were low. */
|
||||
private boolean resourcesLow = false;
|
||||
|
||||
|
@ -2980,9 +2981,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
|||
*/
|
||||
private synchronized void initializeReplQueues() {
|
||||
LOG.info("initializing replication queues");
|
||||
if (isPopulatingReplQueues()) {
|
||||
LOG.warn("Replication queues already initialized.");
|
||||
}
|
||||
assert !isPopulatingReplQueues() : "Already initialized repl queues";
|
||||
long startTimeMisReplicatedScan = now();
|
||||
blockManager.processMisReplicatedBlocks();
|
||||
initializedReplQueues = true;
|
||||
|
@ -4412,4 +4411,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
|||
byte[] password) throws InvalidToken {
|
||||
getDelegationTokenSecretManager().verifyToken(identifier, password);
|
||||
}
|
||||
|
||||
/**
 * Test-only accessor that hands back the {@code safeMode} field as-is
 * (no copy is made).
 *
 * @return the value currently held in {@code safeMode}
 */
@VisibleForTesting
|
||||
public SafeModeInfo getSafeModeInfoForTests() {
|
||||
return safeMode;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -87,7 +87,6 @@ import com.google.common.collect.ImmutableList;
|
|||
* primary NameNode.
|
||||
*
|
||||
**********************************************************/
|
||||
@Deprecated // use BackupNode with -checkpoint argument instead.
|
||||
@InterfaceAudience.Private
|
||||
public class SecondaryNameNode implements Runnable {
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
|
||||
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
|
||||
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
|
||||
<body>
|
||||
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
|
||||
<%=NamenodeJspHelper.getVersionTable(fsn)%>
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
<html>
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
|
||||
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
|
||||
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
|
||||
|
||||
<body>
|
||||
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
|
||||
|
|
|
@ -37,7 +37,7 @@ String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameN
|
|||
<html>
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
|
||||
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
|
||||
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
|
||||
|
||||
<body>
|
||||
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
|
||||
|
|
|
@ -193,6 +193,10 @@ public class BenchmarkThroughput extends Configured implements Tool {
|
|||
BUFFER_SIZE = conf.getInt("dfsthroughput.buffer.size", 4 * 1024);
|
||||
|
||||
String localDir = conf.get("mapred.temp.dir");
|
||||
if (localDir == null) {
|
||||
localDir = conf.get("hadoop.tmp.dir");
|
||||
conf.set("mapred.temp.dir", localDir);
|
||||
}
|
||||
dir = new LocalDirAllocator("mapred.temp.dir");
|
||||
|
||||
System.setProperty("test.build.data", localDir);
|
||||
|
|
|
@ -19,22 +19,37 @@
|
|||
package org.apache.hadoop.hdfs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import org.junit.Before;
|
||||
import org.junit.After;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.base.Supplier;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
* Tests to verify safe mode correctness.
|
||||
*/
|
||||
public class TestSafeMode {
|
||||
private static final Path TEST_PATH = new Path("/test");
|
||||
private static final int BLOCK_SIZE = 1024;
|
||||
Configuration conf;
|
||||
MiniDFSCluster cluster;
|
||||
FileSystem fs;
|
||||
|
@ -43,6 +58,7 @@ public class TestSafeMode {
|
|||
@Before
|
||||
public void startUp() throws IOException {
|
||||
conf = new HdfsConfiguration();
|
||||
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
|
||||
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
|
||||
cluster.waitActive();
|
||||
fs = cluster.getFileSystem();
|
||||
|
@ -83,7 +99,7 @@ public class TestSafeMode {
|
|||
|
||||
// create two files with one block each.
|
||||
DFSTestUtil.createFile(fs, file1, 1000, (short)1, 0);
|
||||
DFSTestUtil.createFile(fs, file2, 2000, (short)1, 0);
|
||||
DFSTestUtil.createFile(fs, file2, 1000, (short)1, 0);
|
||||
fs.close();
|
||||
cluster.shutdown();
|
||||
|
||||
|
@ -128,6 +144,106 @@ public class TestSafeMode {
|
|||
assertEquals("", status);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that the NN initializes its under-replicated blocks queue
|
||||
* before it is ready to exit safemode (HDFS-1476)
|
||||
*/
|
||||
@Test(timeout=45000)
|
||||
public void testInitializeReplQueuesEarly() throws Exception {
|
||||
// Spray the blocks around the cluster when we add DNs instead of
|
||||
// concentrating all blocks on the first node.
|
||||
BlockManagerTestUtil.setWritingPrefersLocalNode(
|
||||
cluster.getNamesystem().getBlockManager(), false);
|
||||
|
||||
// Two more DataNodes on top of the one started in startUp().
cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null);
|
||||
cluster.waitActive();
|
||||
// 15 * BLOCK_SIZE bytes at replication 1; the assertions below expect this
// to amount to 15 blocks in total.
DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L);
|
||||
|
||||
|
||||
// Stop all three DataNodes, keeping the returned properties so they can be
// restarted later. Index 0 is used every time -- presumably the remaining
// nodes shift down after each stop; TODO confirm against MiniDFSCluster.
List<DataNodeProperties> dnprops = Lists.newLinkedList();
|
||||
dnprops.add(cluster.stopDataNode(0));
|
||||
dnprops.add(cluster.stopDataNode(0));
|
||||
dnprops.add(cluster.stopDataNode(0));
|
||||
|
||||
// 1/15 of the blocks: presumably a single safe block is then enough to
// trigger replication-queue initialization -- confirm against SafeModeInfo.
cluster.getConfiguration(0).setFloat(
|
||||
DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f);
|
||||
|
||||
cluster.restartNameNode();
|
||||
final NameNode nn = cluster.getNameNode();
|
||||
|
||||
// No DataNode is running yet, so no block has been reported.
String status = nn.getNamesystem().getSafemode();
|
||||
assertEquals("Safe mode is ON.The reported blocks 0 needs additional " +
|
||||
"15 blocks to reach the threshold 0.9990 of total blocks 15. " +
|
||||
"Safe mode will be turned off automatically.", status);
|
||||
assertFalse("Mis-replicated block queues should not be initialized " +
|
||||
"until threshold is crossed",
|
||||
NameNodeAdapter.safeModeInitializedReplQueues(nn));
|
||||
|
||||
// Bring back only the first of the stopped DataNodes.
cluster.restartDataNode(dnprops.remove(0));
|
||||
|
||||
// Wait for the block report from the restarted DN to come in.
|
||||
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||
@Override
|
||||
public Boolean get() {
|
||||
return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0;
|
||||
}
|
||||
}, 10, 10000);
|
||||
// SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks
|
||||
// call is still going on at this point - wait until it's done by grabbing
|
||||
// the lock.
|
||||
nn.getNamesystem().writeLock();
|
||||
nn.getNamesystem().writeUnlock();
|
||||
int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn);
|
||||
assertTrue("Expected first block report to make some but not all blocks " +
|
||||
"safe. Got: " + safe, safe >= 1 && safe < 15);
|
||||
BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
|
||||
|
||||
// Queues must now be initialized, and every block that is not yet safe is
// expected to be counted as under-replicated.
assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn));
|
||||
assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks());
|
||||
|
||||
cluster.restartDataNodes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that, when under-replicated blocks are processed at the end of
|
||||
* safe-mode, blocks currently under construction are not considered
|
||||
* under-replicated or missing. Regression test for HDFS-2822.
|
||||
*/
|
||||
@Test
|
||||
public void testRbwBlocksNotConsideredUnderReplicated() throws IOException {
|
||||
List<FSDataOutputStream> stms = Lists.newArrayList();
|
||||
try {
|
||||
// Create some junk blocks so that the NN doesn't just immediately
|
||||
// exit safemode on restart.
|
||||
DFSTestUtil.createFile(fs, new Path("/junk-blocks"),
|
||||
BLOCK_SIZE*4, (short)1, 1L);
|
||||
// Create several files which are left open. It's important to
|
||||
// create several here, because otherwise the first iteration of the
|
||||
// replication monitor will pull them off the replication queue and
|
||||
// hide this bug from the test!
|
||||
for (int i = 0; i < 10; i++) {
|
||||
FSDataOutputStream stm = fs.create(
|
||||
new Path("/append-" + i), true, BLOCK_SIZE, (short) 1, BLOCK_SIZE);
|
||||
stms.add(stm);
|
||||
stm.write(1);
|
||||
stm.hflush();
|
||||
}
|
||||
|
||||
cluster.restartNameNode();
|
||||
FSNamesystem ns = cluster.getNameNode(0).getNamesystem();
|
||||
BlockManagerTestUtil.updateState(ns.getBlockManager());
|
||||
assertEquals(0, ns.getPendingReplicationBlocks());
|
||||
assertEquals(0, ns.getCorruptReplicaBlocks());
|
||||
assertEquals(0, ns.getMissingBlocksCount());
|
||||
|
||||
} finally {
|
||||
for (FSDataOutputStream stm : stms) {
|
||||
IOUtils.closeStream(stm);
|
||||
}
|
||||
cluster.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
public interface FSRun {
|
||||
public abstract void run(FileSystem fs) throws IOException;
|
||||
}
|
||||
|
|
|
@ -27,6 +27,8 @@ import org.apache.hadoop.hdfs.protocol.Block;
|
|||
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||
import org.apache.hadoop.util.Daemon;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
public class BlockManagerTestUtil {
|
||||
public static void setNodeReplicationLimit(final BlockManager blockManager,
|
||||
final int limit) {
|
||||
|
@ -122,4 +124,17 @@ public class BlockManagerTestUtil {
|
|||
return blockManager.computeDatanodeWork();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Change whether the block placement policy will prefer the writer's
|
||||
* local Datanode or not.
|
||||
* @param prefer true to prefer the writer's local Datanode, false otherwise
|
||||
*/
|
||||
public static void setWritingPrefersLocalNode(
|
||||
BlockManager bm, boolean prefer) {
|
||||
BlockPlacementPolicy bpp = bm.getBlockPlacementPolicy();
|
||||
Preconditions.checkState(bpp instanceof BlockPlacementPolicyDefault,
|
||||
"Must use default policy, got %s", bpp.getClass());
|
||||
((BlockPlacementPolicyDefault)bpp).setPreferLocalNode(prefer);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
|||
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
|
||||
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
||||
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.SafeModeInfo;
|
||||
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
|
||||
import org.apache.hadoop.ipc.Server;
|
||||
|
||||
|
@ -97,4 +98,28 @@ public class NameNodeAdapter {
|
|||
ns.readUnlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of blocks marked safe by safemode, or -1
|
||||
* if safemode is not running.
|
||||
*/
|
||||
public static int getSafeModeSafeBlocks(NameNode nn) {
|
||||
SafeModeInfo smi = nn.getNamesystem().getSafeModeInfoForTests();
|
||||
if (smi == null) {
|
||||
return -1;
|
||||
}
|
||||
return smi.blockSafe;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if safemode is not running, or if safemode has already
|
||||
* initialized the replication queues
|
||||
*/
|
||||
public static boolean safeModeInitializedReplQueues(NameNode nn) {
|
||||
SafeModeInfo smi = nn.getNamesystem().getSafeModeInfoForTests();
|
||||
if (smi == null) {
|
||||
return true;
|
||||
}
|
||||
return smi.initializedReplQueues;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -203,7 +203,6 @@ public class TestCheckpoint extends TestCase {
|
|||
/*
|
||||
* Simulate namenode crashing after rolling edit log.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSecondaryNamenodeError1()
|
||||
throws IOException {
|
||||
LOG.info("Starting testSecondaryNamenodeError1");
|
||||
|
@ -265,7 +264,6 @@ public class TestCheckpoint extends TestCase {
|
|||
/*
|
||||
* Simulate a namenode crash after uploading new image
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSecondaryNamenodeError2() throws IOException {
|
||||
LOG.info("Starting testSecondaryNamenodeError2");
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
@ -324,7 +322,6 @@ public class TestCheckpoint extends TestCase {
|
|||
/*
|
||||
* Simulate a secondary namenode crash after rolling the edit log.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSecondaryNamenodeError3() throws IOException {
|
||||
LOG.info("Starting testSecondaryNamenodeError3");
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
@ -394,7 +391,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* back to the name-node.
|
||||
* Used to truncate primary fsimage file.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSecondaryFailsToReturnImage() throws IOException {
|
||||
LOG.info("Starting testSecondaryFailsToReturnImage");
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
@ -471,7 +467,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* @param errorType the ErrorSimulator type to trigger
|
||||
* @param exceptionSubstring an expected substring of the triggered exception
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
private void doSendFailTest(int errorType, String exceptionSubstring)
|
||||
throws IOException {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
@ -586,7 +581,6 @@ public class TestCheckpoint extends TestCase {
|
|||
/**
|
||||
* Test that the SecondaryNameNode properly locks its storage directories.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSecondaryNameNodeLocking() throws Exception {
|
||||
// Start a primary NN so that the secondary will start successfully
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
@ -679,7 +673,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* 2. if the NN does not contain an image, importing a checkpoint
|
||||
* succeeds and re-saves the image
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testImportCheckpoint() throws Exception {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
Path testPath = new Path("/testfile");
|
||||
|
@ -760,16 +753,12 @@ public class TestCheckpoint extends TestCase {
|
|||
throw new IOException("Cannot create directory " + dir);
|
||||
}
|
||||
|
||||
// This deprecation suppress warning does not work due to known Java bug:
|
||||
// http://bugs.sun.com/view_bug.do?bug_id=6460147
|
||||
@SuppressWarnings("deprecation")
|
||||
SecondaryNameNode startSecondaryNameNode(Configuration conf
|
||||
) throws IOException {
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, "0.0.0.0:0");
|
||||
return new SecondaryNameNode(conf);
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
SecondaryNameNode startSecondaryNameNode(Configuration conf, int index)
|
||||
throws IOException {
|
||||
Configuration snnConf = new Configuration(conf);
|
||||
|
@ -782,7 +771,6 @@ public class TestCheckpoint extends TestCase {
|
|||
/**
|
||||
* Tests checkpoint in HDFS.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testCheckpoint() throws IOException {
|
||||
Path file1 = new Path("checkpoint.dat");
|
||||
Path file2 = new Path("checkpoint2.dat");
|
||||
|
@ -1009,7 +997,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* - it then fails again for the same reason
|
||||
* - it then tries to checkpoint a third time
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testCheckpointAfterTwoFailedUploads() throws IOException {
|
||||
MiniDFSCluster cluster = null;
|
||||
SecondaryNameNode secondary = null;
|
||||
|
@ -1064,7 +1051,6 @@ public class TestCheckpoint extends TestCase {
|
|||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testMultipleSecondaryNamenodes() throws IOException {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
String nameserviceId1 = "ns1";
|
||||
|
@ -1114,7 +1100,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* Test that the secondary doesn't have to re-download image
|
||||
* if it hasn't changed.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSecondaryImageDownload() throws IOException {
|
||||
LOG.info("Starting testSecondaryImageDownload");
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
@ -1197,7 +1182,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* It verifies that this works even though the earlier-txid checkpoint gets
|
||||
* uploaded after the later-txid checkpoint.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testMultipleSecondaryNNsAgainstSameNN() throws Exception {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
||||
|
@ -1283,7 +1267,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* It verifies that one of the two gets an error that it's uploading a
|
||||
* duplicate checkpoint, and the other one succeeds.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testMultipleSecondaryNNsAgainstSameNN2() throws Exception {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
|
||||
|
@ -1382,7 +1365,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* is running. The secondary should shut itself down if it talks to a NN
|
||||
* with the wrong namespace.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testReformatNNBetweenCheckpoints() throws IOException {
|
||||
MiniDFSCluster cluster = null;
|
||||
SecondaryNameNode secondary = null;
|
||||
|
@ -1637,7 +1619,6 @@ public class TestCheckpoint extends TestCase {
|
|||
/**
|
||||
* Test that the 2NN triggers a checkpoint after the configurable interval
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testCheckpointTriggerOnTxnCount() throws Exception {
|
||||
MiniDFSCluster cluster = null;
|
||||
SecondaryNameNode secondary = null;
|
||||
|
@ -1691,7 +1672,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* logs that connect the 2NN's old checkpoint to the current txid
|
||||
* get archived. Then, the 2NN tries to checkpoint again.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSecondaryHasVeryOutOfDateImage() throws IOException {
|
||||
MiniDFSCluster cluster = null;
|
||||
SecondaryNameNode secondary = null;
|
||||
|
@ -1729,7 +1709,6 @@ public class TestCheckpoint extends TestCase {
|
|||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testCommandLineParsing() throws ParseException {
|
||||
SecondaryNameNode.CommandLineOpts opts =
|
||||
new SecondaryNameNode.CommandLineOpts();
|
||||
|
@ -1764,7 +1743,6 @@ public class TestCheckpoint extends TestCase {
|
|||
} catch (ParseException e) {}
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
private void cleanup(SecondaryNameNode snn) {
|
||||
if (snn != null) {
|
||||
try {
|
||||
|
@ -1780,7 +1758,6 @@ public class TestCheckpoint extends TestCase {
|
|||
* Assert that if any two files have the same name across the 2NNs
|
||||
* and NN, they should have the same content too.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
private void assertParallelFilesInvariant(MiniDFSCluster cluster,
|
||||
ImmutableList<SecondaryNameNode> secondaries) throws Exception {
|
||||
List<File> allCurrentDirs = Lists.newArrayList();
|
||||
|
@ -1792,7 +1769,6 @@ public class TestCheckpoint extends TestCase {
|
|||
ImmutableSet.of("VERSION"));
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
private List<File> getCheckpointCurrentDirs(SecondaryNameNode secondary) {
|
||||
List<File> ret = Lists.newArrayList();
|
||||
for (URI u : secondary.getCheckpointDirs()) {
|
||||
|
@ -1802,7 +1778,6 @@ public class TestCheckpoint extends TestCase {
|
|||
return ret;
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
private CheckpointStorage spyOnSecondaryImage(SecondaryNameNode secondary1) {
|
||||
CheckpointStorage spy = Mockito.spy((CheckpointStorage)secondary1.getFSImage());;
|
||||
secondary1.setFSImage(spy);
|
||||
|
@ -1812,7 +1787,6 @@ public class TestCheckpoint extends TestCase {
|
|||
/**
|
||||
* A utility class to perform a checkpoint in a different thread.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
private static class DoCheckpointThread extends Thread {
|
||||
private final SecondaryNameNode snn;
|
||||
private volatile Throwable thrown = null;
|
||||
|
|
|
@ -106,9 +106,6 @@ public class TestNameEditsConfigs extends TestCase {
|
|||
assertTrue(!fileSys.exists(name));
|
||||
}
|
||||
|
||||
// This deprecation suppress warning does not work due to known Java bug:
|
||||
// http://bugs.sun.com/view_bug.do?bug_id=6460147
|
||||
@SuppressWarnings("deprecation")
|
||||
SecondaryNameNode startSecondaryNameNode(Configuration conf
|
||||
) throws IOException {
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, "0.0.0.0:0");
|
||||
|
@ -128,7 +125,6 @@ public class TestNameEditsConfigs extends TestCase {
|
|||
* sure we are reading proper edits and image.
|
||||
* @throws Exception
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testNameEditsConfigs() throws Exception {
|
||||
Path file1 = new Path("TestNameEditsConfigs1");
|
||||
Path file2 = new Path("TestNameEditsConfigs2");
|
||||
|
|
|
@ -30,7 +30,6 @@ import org.junit.Test;
|
|||
|
||||
public class TestSecondaryWebUi {
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
@Test
|
||||
public void testSecondaryWebUi() throws IOException {
|
||||
Configuration conf = new Configuration();
|
||||
|
|
|
@ -120,7 +120,6 @@ public class TestStartup extends TestCase {
|
|||
* start MiniDFScluster, create a file (to create edits) and do a checkpoint
|
||||
* @throws IOException
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void createCheckPoint() throws IOException {
|
||||
LOG.info("--starting mini cluster");
|
||||
// manage dirs parameter set to false
|
||||
|
@ -300,7 +299,6 @@ public class TestStartup extends TestCase {
|
|||
* secondary node copies fsimage and edits into correct separate directories.
|
||||
* @throws IOException
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
public void testSNNStartup() throws IOException{
|
||||
//setUpConfig();
|
||||
LOG.info("--starting SecondNN startup test");
|
||||
|
|
|
@ -153,7 +153,6 @@ public class TestStorageRestore {
|
|||
* 7. run doCheckpoint
|
||||
* 8. verify that all the image and edits files are the same.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
@Test
|
||||
public void testStorageRestore() throws Exception {
|
||||
int numDatanodes = 0;
|
||||
|
@ -310,7 +309,6 @@ public class TestStorageRestore {
|
|||
* then try to perform a checkpoint. The NN should not serve up the image or
|
||||
* edits from the restored (empty) dir.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
@Test
|
||||
public void testMultipleSecondaryCheckpoint() throws IOException {
|
||||
|
||||
|
|
|
@ -142,6 +142,14 @@ Release 0.23.1 - Unreleased
|
|||
|
||||
MAPREDUCE-3692. yarn-resourcemanager out and log files can get big. (eli)
|
||||
|
||||
MAPREDUCE-3710. Improved FileInputFormat to return better locality for the
|
||||
last split. (Siddarth Seth via vinodkv)
|
||||
|
||||
MAPREDUCE-2765. DistCp Rewrite. (Mithun Radhakrishnan via mahadev)
|
||||
|
||||
MAPREDUCE-3737. The Web Application Proxy is not documented very well.
|
||||
(Robert Evans via mahadev)
|
||||
|
||||
OPTIMIZATIONS
|
||||
|
||||
MAPREDUCE-3567. Extraneous JobConf objects in AM heap. (Vinod Kumar
|
||||
|
@ -165,7 +173,13 @@ Release 0.23.1 - Unreleased
|
|||
MAPREDUCE-3512. Batching JobHistory flushing to DFS so that we don't flush
|
||||
for every event slowing down AM. (Siddarth Seth via vinodkv)
|
||||
|
||||
MAPREDUCE-3718. Change default AM heartbeat interval to 1 second. (Hitesh
|
||||
Shah via sseth)
|
||||
|
||||
BUG FIXES
|
||||
MAPREDUCE-3194. "mapred mradmin" command is broken in mrv2
|
||||
(Jason Lowe via bobby)
|
||||
|
||||
MAPREDUCE-3462. Fix Gridmix JUnit testcase failures.
|
||||
(Ravi Prakash and Ravi Gummadi via amarrk)
|
||||
|
||||
|
@ -499,6 +513,48 @@ Release 0.23.1 - Unreleased
|
|||
MAPREDUCE-3705. ant build fails on 0.23 branch. (Thomas Graves via
|
||||
mahadev)
|
||||
|
||||
MAPREDUCE-3691. webservices add support to compress response.
|
||||
(Thomas Graves via mahadev)
|
||||
|
||||
MAPREDUCE-3702. internal server error trying access application master
|
||||
via proxy with filter enabled (Thomas Graves via mahadev)
|
||||
|
||||
MAPREDUCE-3646. Remove redundant URL info from "mapred job" output.
|
||||
(Jonathan Eagles via mahadev)
|
||||
|
||||
MAPREDUCE-3681. Fixed computation of queue's usedCapacity. (acmurthy)
|
||||
|
||||
MAPREDUCE-3505. yarn APPLICATION_CLASSPATH needs to be overridable.
|
||||
(ahmed via tucu)
|
||||
|
||||
MAPREDUCE-3714. Fixed EventFetcher and Fetcher threads to shut-down properly
|
||||
so that reducers don't hang in corner cases. (vinodkv)
|
||||
|
||||
MAPREDUCE-3712. The mapreduce tar does not contain the hadoop-mapreduce-client-
|
||||
jobclient-tests.jar. (mahadev)
|
||||
|
||||
MAPREDUCE-3717. JobClient test jar has missing files to run all the test programs.
|
||||
(mahadev)
|
||||
|
||||
MAPREDUCE-3630. Fixes a NullPointer exception while running TeraGen - if a
|
||||
map is asked to generate 0 records. (Mahadev Konar via sseth)
|
||||
|
||||
MAPREDUCE-3683. Fixed maxCapacity of queues to be product of parent
|
||||
maxCapacities. (acmurthy)
|
||||
|
||||
MAPREDUCE-3713. Fixed the way head-room is allocated to applications by
|
||||
CapacityScheduler so that it deducts current-usage per user and not
|
||||
per-application. (Arun C Murthy via vinodkv)
|
||||
|
||||
MAPREDUCE-3721. Fixed a race in shuffle which caused reduces to hang.
|
||||
(sseth via acmurthy)
|
||||
|
||||
MAPREDUCE-3733. Add Apache License Header to hadoop-distcp/pom.xml.
|
||||
(mahadev)
|
||||
|
||||
MAPREDUCE-3735. Add distcp jar to the distribution (tar).
|
||||
(mahadev)
|
||||
|
||||
Release 0.23.0 - 2011-11-01
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -30,9 +30,6 @@ fi
|
|||
function print_usage(){
|
||||
echo "Usage: mapred [--config confdir] COMMAND"
|
||||
echo " where COMMAND is one of:"
|
||||
echo " mradmin run a Map-Reduce admin client"
|
||||
echo " jobtracker run the MapReduce job Tracker node"
|
||||
echo " tasktracker run a MapReduce task Tracker node"
|
||||
echo " pipes run a Pipes job"
|
||||
echo " job manipulate MapReduce jobs"
|
||||
echo " queue get information regarding JobQueues"
|
||||
|
@ -51,16 +48,7 @@ fi
|
|||
COMMAND=$1
|
||||
shift
|
||||
|
||||
if [ "$COMMAND" = "mradmin" ] ; then
|
||||
CLASS=org.apache.hadoop.mapred.tools.MRAdmin
|
||||
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
|
||||
elif [ "$COMMAND" = "jobtracker" ] ; then
|
||||
CLASS=org.apache.hadoop.mapred.JobTracker
|
||||
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
|
||||
elif [ "$COMMAND" = "tasktracker" ] ; then
|
||||
CLASS=org.apache.hadoop.mapred.TaskTracker
|
||||
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
|
||||
elif [ "$COMMAND" = "job" ] ; then
|
||||
if [ "$COMMAND" = "job" ] ; then
|
||||
CLASS=org.apache.hadoop.mapred.JobClient
|
||||
elif [ "$COMMAND" = "queue" ] ; then
|
||||
CLASS=org.apache.hadoop.mapred.JobQueueClient
|
||||
|
@ -75,6 +63,13 @@ elif [ "$COMMAND" = "classpath" ] ; then
|
|||
elif [ "$COMMAND" = "groups" ] ; then
|
||||
CLASS=org.apache.hadoop.mapred.tools.GetGroups
|
||||
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
|
||||
elif [ "$COMMAND" = "mradmin" ] \
|
||||
|| [ "$COMMAND" = "jobtracker" ] \
|
||||
|| [ "$COMMAND" = "tasktracker" ] ; then
|
||||
echo "Sorry, the $COMMAND command is no longer supported."
|
||||
echo "You may find similar functionality with the \"yarn\" shell command."
|
||||
print_usage
|
||||
exit
|
||||
else
|
||||
echo $COMMAND - invalid command
|
||||
print_usage
|
||||
|
|
|
@ -522,13 +522,13 @@ public abstract class TaskAttemptImpl implements
|
|||
* a parent CLC and use it for all the containers, so this should go away
|
||||
* once the mr-generated-classpath stuff is gone.
|
||||
*/
|
||||
private static String getInitialClasspath() throws IOException {
|
||||
private static String getInitialClasspath(Configuration conf) throws IOException {
|
||||
synchronized (classpathLock) {
|
||||
if (initialClasspathFlag.get()) {
|
||||
return initialClasspath;
|
||||
}
|
||||
Map<String, String> env = new HashMap<String, String>();
|
||||
MRApps.setClasspath(env);
|
||||
MRApps.setClasspath(env, conf);
|
||||
initialClasspath = env.get(Environment.CLASSPATH.name());
|
||||
initialClasspathFlag.set(true);
|
||||
return initialClasspath;
|
||||
|
@ -631,7 +631,7 @@ public abstract class TaskAttemptImpl implements
|
|||
Apps.addToEnvironment(
|
||||
environment,
|
||||
Environment.CLASSPATH.name(),
|
||||
getInitialClasspath());
|
||||
getInitialClasspath(conf));
|
||||
} catch (IOException e) {
|
||||
throw new YarnException(e);
|
||||
}
|
||||
|
|
|
@ -38,6 +38,10 @@
|
|||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-yarn-server-common</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
|
|
@ -54,6 +54,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
|
|||
import org.apache.hadoop.yarn.api.records.LocalResource;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceType;
|
||||
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
import org.apache.hadoop.yarn.util.Apps;
|
||||
import org.apache.hadoop.yarn.util.BuilderUtils;
|
||||
|
@ -171,7 +172,7 @@ public class MRApps extends Apps {
|
|||
}
|
||||
|
||||
private static void setMRFrameworkClasspath(
|
||||
Map<String, String> environment) throws IOException {
|
||||
Map<String, String> environment, Configuration conf) throws IOException {
|
||||
InputStream classpathFileStream = null;
|
||||
BufferedReader reader = null;
|
||||
try {
|
||||
|
@ -208,8 +209,10 @@ public class MRApps extends Apps {
|
|||
}
|
||||
|
||||
// Add standard Hadoop classes
|
||||
for (String c : ApplicationConstants.APPLICATION_CLASSPATH) {
|
||||
Apps.addToEnvironment(environment, Environment.CLASSPATH.name(), c);
|
||||
for (String c : conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH)
|
||||
.split(",")) {
|
||||
Apps.addToEnvironment(environment, Environment.CLASSPATH.name(), c
|
||||
.trim());
|
||||
}
|
||||
} finally {
|
||||
if (classpathFileStream != null) {
|
||||
|
@ -222,8 +225,8 @@ public class MRApps extends Apps {
|
|||
// TODO: Remove duplicates.
|
||||
}
|
||||
|
||||
public static void setClasspath(Map<String, String> environment)
|
||||
throws IOException {
|
||||
public static void setClasspath(Map<String, String> environment,
|
||||
Configuration conf) throws IOException {
|
||||
Apps.addToEnvironment(
|
||||
environment,
|
||||
Environment.CLASSPATH.name(),
|
||||
|
@ -232,7 +235,7 @@ public class MRApps extends Apps {
|
|||
environment,
|
||||
Environment.CLASSPATH.name(),
|
||||
Environment.PWD.$() + Path.SEPARATOR + "*");
|
||||
MRApps.setMRFrameworkClasspath(environment);
|
||||
MRApps.setMRFrameworkClasspath(environment, conf);
|
||||
}
|
||||
|
||||
private static final String STAGING_CONSTANT = ".staging";
|
||||
|
|
|
@ -18,7 +18,12 @@
|
|||
|
||||
package org.apache.hadoop.mapreduce.v2.util;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.JobID;
|
||||
import org.apache.hadoop.mapreduce.MRJobConfig;
|
||||
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
|
||||
|
@ -121,4 +126,17 @@ public class TestMRApps {
|
|||
"/my/path/to/staging/dummy-user/.staging/job_dummy-job_12345/job.xml", jobFile);
|
||||
}
|
||||
|
||||
@Test public void testSetClasspath() throws IOException {
|
||||
Job job = Job.getInstance();
|
||||
Map<String, String> environment = new HashMap<String, String>();
|
||||
MRApps.setClasspath(environment, job.getConfiguration());
|
||||
assertEquals("job.jar:$PWD/*:$HADOOP_CONF_DIR:" +
|
||||
"$HADOOP_COMMON_HOME/share/hadoop/common/*:" +
|
||||
"$HADOOP_COMMON_HOME/share/hadoop/common/lib/*:" +
|
||||
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/*:" +
|
||||
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*:" +
|
||||
"$YARN_HOME/share/hadoop/mapreduce/*:" +
|
||||
"$YARN_HOME/share/hadoop/mapreduce/lib/*",
|
||||
environment.get("CLASSPATH"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -289,8 +289,10 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
|
|||
}
|
||||
|
||||
if (bytesRemaining != 0) {
|
||||
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
|
||||
blkLocations[blkLocations.length-1].getHosts()));
|
||||
String[] splitHosts = getSplitHosts(blkLocations, length
|
||||
- bytesRemaining, bytesRemaining, clusterMap);
|
||||
splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
|
||||
splitHosts));
|
||||
}
|
||||
} else if (length != 0) {
|
||||
String[] splitHosts = getSplitHosts(blkLocations,0,length,clusterMap);
|
||||
|
|
|
@ -1216,6 +1216,7 @@ public class Job extends JobContextImpl implements JobContext {
|
|||
}
|
||||
});
|
||||
state = JobState.RUNNING;
|
||||
LOG.info("The url to track the job: " + getTrackingURL());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -417,7 +417,7 @@ public interface MRJobConfig {
|
|||
/** How often the AM should send heartbeats to the RM.*/
|
||||
public static final String MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS =
|
||||
MR_AM_PREFIX + "scheduler.heartbeat.interval-ms";
|
||||
public static final int DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = 2000;
|
||||
public static final int DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = 1000;
|
||||
|
||||
/**
|
||||
* If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS
|
||||
|
|
|
@ -286,8 +286,9 @@ public abstract class FileInputFormat<K, V> extends InputFormat<K, V> {
|
|||
}
|
||||
|
||||
if (bytesRemaining != 0) {
|
||||
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
|
||||
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
|
||||
blkLocations[blkLocations.length-1].getHosts()));
|
||||
blkLocations[blkIndex].getHosts()));
|
||||
}
|
||||
} else { // not splitable
|
||||
splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.hadoop.mapred.TaskCompletionEvent;
|
|||
import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
|
||||
import org.apache.hadoop.mapreduce.TaskAttemptID;
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
class EventFetcher<K,V> extends Thread {
|
||||
private static final long SLEEP_TIME = 1000;
|
||||
private static final int MAX_EVENTS_TO_FETCH = 10000;
|
||||
|
@ -42,6 +43,8 @@ class EventFetcher<K,V> extends Thread {
|
|||
|
||||
private int maxMapRuntime = 0;
|
||||
|
||||
private volatile boolean stopped = false;
|
||||
|
||||
public EventFetcher(TaskAttemptID reduce,
|
||||
TaskUmbilicalProtocol umbilical,
|
||||
ShuffleScheduler<K,V> scheduler,
|
||||
|
@ -60,7 +63,7 @@ class EventFetcher<K,V> extends Thread {
|
|||
LOG.info(reduce + " Thread started: " + getName());
|
||||
|
||||
try {
|
||||
while (true && !Thread.currentThread().isInterrupted()) {
|
||||
while (!stopped && !Thread.currentThread().isInterrupted()) {
|
||||
try {
|
||||
int numNewMaps = getMapCompletionEvents();
|
||||
failures = 0;
|
||||
|
@ -71,6 +74,9 @@ class EventFetcher<K,V> extends Thread {
|
|||
if (!Thread.currentThread().isInterrupted()) {
|
||||
Thread.sleep(SLEEP_TIME);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
LOG.info("EventFetcher is interrupted.. Returning");
|
||||
return;
|
||||
} catch (IOException ie) {
|
||||
LOG.info("Exception in getting events", ie);
|
||||
// check to see whether to abort
|
||||
|
@ -91,6 +97,16 @@ class EventFetcher<K,V> extends Thread {
|
|||
}
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
this.stopped = true;
|
||||
interrupt();
|
||||
try {
|
||||
join(5000);
|
||||
} catch(InterruptedException ie) {
|
||||
LOG.warn("Got interrupted while joining " + getName(), ie);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Queries the {@link TaskTracker} for a set of map-completion events
|
||||
* from a given event ID.
|
||||
|
|
|
@ -48,6 +48,7 @@ import org.apache.hadoop.mapreduce.task.reduce.MapOutput.Type;
|
|||
import org.apache.hadoop.util.Progressable;
|
||||
import org.apache.hadoop.util.ReflectionUtils;
|
||||
|
||||
@SuppressWarnings({"deprecation"})
|
||||
class Fetcher<K,V> extends Thread {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(Fetcher.class);
|
||||
|
@ -88,6 +89,8 @@ class Fetcher<K,V> extends Thread {
|
|||
private final Decompressor decompressor;
|
||||
private final SecretKey jobTokenSecret;
|
||||
|
||||
private volatile boolean stopped = false;
|
||||
|
||||
public Fetcher(JobConf job, TaskAttemptID reduceId,
|
||||
ShuffleScheduler<K,V> scheduler, MergeManager<K,V> merger,
|
||||
Reporter reporter, ShuffleClientMetrics metrics,
|
||||
|
@ -135,7 +138,7 @@ class Fetcher<K,V> extends Thread {
|
|||
|
||||
public void run() {
|
||||
try {
|
||||
while (true && !Thread.currentThread().isInterrupted()) {
|
||||
while (!stopped && !Thread.currentThread().isInterrupted()) {
|
||||
MapHost host = null;
|
||||
try {
|
||||
// If merge is on, block
|
||||
|
@ -161,6 +164,16 @@ class Fetcher<K,V> extends Thread {
|
|||
}
|
||||
}
|
||||
|
||||
public void shutDown() throws InterruptedException {
|
||||
this.stopped = true;
|
||||
interrupt();
|
||||
try {
|
||||
join(5000);
|
||||
} catch (InterruptedException ie) {
|
||||
LOG.warn("Got interrupt while joining " + getName(), ie);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The crux of the matter...
|
||||
*
|
||||
|
|
|
@ -92,6 +92,7 @@ public class MergeManager<K, V> {
|
|||
|
||||
private final long memoryLimit;
|
||||
private long usedMemory;
|
||||
private long commitMemory;
|
||||
private final long maxSingleShuffleLimit;
|
||||
|
||||
private final int memToMemMergeOutputsThreshold;
|
||||
|
@ -181,6 +182,13 @@ public class MergeManager<K, V> {
|
|||
"ioSortFactor=" + ioSortFactor + ", " +
|
||||
"memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);
|
||||
|
||||
if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
|
||||
throw new RuntimeException("Invlaid configuration: "
|
||||
+ "maxSingleShuffleLimit should be less than mergeThreshold"
|
||||
+ "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit
|
||||
+ "mergeThreshold: " + this.mergeThreshold);
|
||||
}
|
||||
|
||||
boolean allowMemToMemMerge =
|
||||
jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
|
||||
if (allowMemToMemMerge) {
|
||||
|
@ -245,16 +253,16 @@ public class MergeManager<K, V> {
|
|||
// all the stalled threads
|
||||
|
||||
if (usedMemory > memoryLimit) {
|
||||
LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory +
|
||||
") is greater than memoryLimit (" + memoryLimit + ")");
|
||||
|
||||
LOG.debug(mapId + ": Stalling shuffle since usedMemory (" + usedMemory
|
||||
+ ") is greater than memoryLimit (" + memoryLimit + ")." +
|
||||
" CommitMemory is (" + commitMemory + ")");
|
||||
return stallShuffle;
|
||||
}
|
||||
|
||||
// Allow the in-memory shuffle to progress
|
||||
LOG.debug(mapId + ": Proceeding with shuffle since usedMemory (" +
|
||||
usedMemory +
|
||||
") is lesser than memoryLimit (" + memoryLimit + ")");
|
||||
LOG.debug(mapId + ": Proceeding with shuffle since usedMemory ("
|
||||
+ usedMemory + ") is lesser than memoryLimit (" + memoryLimit + ")."
|
||||
+ "CommitMemory is (" + commitMemory + ")");
|
||||
return unconditionalReserve(mapId, requestedSize, true);
|
||||
}
|
||||
|
||||
|
@ -270,18 +278,24 @@ public class MergeManager<K, V> {
|
|||
}
|
||||
|
||||
synchronized void unreserve(long size) {
|
||||
commitMemory -= size;
|
||||
usedMemory -= size;
|
||||
}
|
||||
|
||||
public synchronized void closeInMemoryFile(MapOutput<K,V> mapOutput) {
|
||||
inMemoryMapOutputs.add(mapOutput);
|
||||
LOG.info("closeInMemoryFile -> map-output of size: " + mapOutput.getSize()
|
||||
+ ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size());
|
||||
+ ", inMemoryMapOutputs.size() -> " + inMemoryMapOutputs.size()
|
||||
+ ", commitMemory -> " + commitMemory + ", usedMemory ->" + usedMemory);
|
||||
|
||||
commitMemory+= mapOutput.getSize();
|
||||
|
||||
synchronized (inMemoryMerger) {
|
||||
if (!inMemoryMerger.isInProgress() && usedMemory >= mergeThreshold) {
|
||||
LOG.info("Starting inMemoryMerger's merge since usedMemory=" +
|
||||
usedMemory + " > mergeThreshold=" + mergeThreshold);
|
||||
// Can hang if mergeThreshold is really low.
|
||||
if (!inMemoryMerger.isInProgress() && commitMemory >= mergeThreshold) {
|
||||
LOG.info("Starting inMemoryMerger's merge since commitMemory=" +
|
||||
commitMemory + " > mergeThreshold=" + mergeThreshold +
|
||||
". Current usedMemory=" + usedMemory);
|
||||
inMemoryMapOutputs.addAll(inMemoryMergedMapOutputs);
|
||||
inMemoryMergedMapOutputs.clear();
|
||||
inMemoryMerger.startMerge(inMemoryMapOutputs);
|
||||
|
|
|
@ -19,8 +19,6 @@ package org.apache.hadoop.mapreduce.task.reduce;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
@ -33,17 +31,17 @@ import org.apache.hadoop.mapred.RawKeyValueIterator;
|
|||
import org.apache.hadoop.mapred.Reducer;
|
||||
import org.apache.hadoop.mapred.Reporter;
|
||||
import org.apache.hadoop.mapred.Task;
|
||||
import org.apache.hadoop.mapred.Task.CombineOutputCollector;
|
||||
import org.apache.hadoop.mapred.TaskStatus;
|
||||
import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
|
||||
import org.apache.hadoop.mapred.Task.CombineOutputCollector;
|
||||
import org.apache.hadoop.mapreduce.MRJobConfig;
|
||||
import org.apache.hadoop.mapreduce.TaskAttemptID;
|
||||
import org.apache.hadoop.util.Progress;
|
||||
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Unstable
|
||||
@SuppressWarnings({"deprecation", "unchecked", "rawtypes"})
|
||||
public class Shuffle<K, V> implements ExceptionReporter {
|
||||
private static final Log LOG = LogFactory.getLog(Shuffle.class);
|
||||
private static final int PROGRESS_FREQUENCY = 2000;
|
||||
|
||||
private final TaskAttemptID reduceId;
|
||||
|
@ -100,7 +98,6 @@ public class Shuffle<K, V> implements ExceptionReporter {
|
|||
this, mergePhase, mapOutputFile);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public RawKeyValueIterator run() throws IOException, InterruptedException {
|
||||
// Start the map-completion events fetcher thread
|
||||
final EventFetcher<K,V> eventFetcher =
|
||||
|
@ -130,19 +127,11 @@ public class Shuffle<K, V> implements ExceptionReporter {
|
|||
}
|
||||
|
||||
// Stop the event-fetcher thread
|
||||
eventFetcher.interrupt();
|
||||
try {
|
||||
eventFetcher.join();
|
||||
} catch(Throwable t) {
|
||||
LOG.info("Failed to stop " + eventFetcher.getName(), t);
|
||||
}
|
||||
eventFetcher.shutDown();
|
||||
|
||||
// Stop the map-output fetcher threads
|
||||
for (Fetcher<K,V> fetcher : fetchers) {
|
||||
fetcher.interrupt();
|
||||
}
|
||||
for (Fetcher<K,V> fetcher : fetchers) {
|
||||
fetcher.join();
|
||||
fetcher.shutDown();
|
||||
}
|
||||
fetchers = null;
|
||||
|
||||
|
|
|
@ -102,6 +102,13 @@
|
|||
<phase>test-compile</phase>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<archive>
|
||||
<manifest>
|
||||
<mainClass>org.apache.hadoop.test.MapredTestDriver</mainClass>
|
||||
</manifest>
|
||||
</archive>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
|
|
|
@ -175,7 +175,6 @@ public class ClientServiceDelegate {
|
|||
+ ":" + addr.getPort()));
|
||||
newUgi.addToken(clientToken);
|
||||
}
|
||||
LOG.info("The url to track the job: " + application.getTrackingUrl());
|
||||
LOG.debug("Connecting to " + serviceAddr);
|
||||
final String tempStr = serviceAddr;
|
||||
realProxy = newUgi.doAs(new PrivilegedExceptionAction<MRClientProtocol>() {
|
||||
|
|
|
@ -406,7 +406,7 @@ public class YARNRunner implements ClientProtocol {
|
|||
// Setup the CLASSPATH in environment
|
||||
// i.e. add { job jar, CWD, Hadoop jars} to classpath.
|
||||
Map<String, String> environment = new HashMap<String, String>();
|
||||
MRApps.setClasspath(environment);
|
||||
MRApps.setClasspath(environment, conf);
|
||||
|
||||
// Parse distributed cache
|
||||
MRApps.setupDistributedCache(jobConf, localResources);
|
||||
|
|
|
@ -29,7 +29,6 @@ import java.util.Stack;
|
|||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.examples.RandomTextWriter;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
@ -40,6 +39,7 @@ import org.apache.hadoop.io.Writable;
|
|||
import org.apache.hadoop.io.WritableComparable;
|
||||
import org.apache.hadoop.io.WritableUtils;
|
||||
import org.apache.hadoop.mapred.lib.NullOutputFormat;
|
||||
import org.apache.hadoop.mapreduce.RandomTextWriter;
|
||||
import org.apache.hadoop.util.GenericOptionsParser;
|
||||
import org.apache.hadoop.util.ReflectionUtils;
|
||||
import org.apache.hadoop.util.Tool;
|
|
@ -17,6 +17,10 @@
|
|||
*/
|
||||
package org.apache.hadoop.mapred;
|
||||
|
||||
import static org.mockito.Matchers.any;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -32,6 +36,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
|
|||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.io.Text;
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
public class TestFileInputFormat extends TestCase {
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
|
@ -186,6 +191,102 @@ public class TestFileInputFormat extends TestCase {
|
|||
assertEquals(splits.length, 2);
|
||||
}
|
||||
|
||||
@SuppressWarnings("rawtypes")
|
||||
public void testLastInputSplitAtSplitBoundary() throws Exception {
|
||||
FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024,
|
||||
128l * 1024 * 1024);
|
||||
JobConf job = new JobConf();
|
||||
InputSplit[] splits = fif.getSplits(job, 8);
|
||||
assertEquals(8, splits.length);
|
||||
for (int i = 0; i < splits.length; i++) {
|
||||
InputSplit split = splits[i];
|
||||
assertEquals(("host" + i), split.getLocations()[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("rawtypes")
|
||||
public void testLastInputSplitExceedingSplitBoundary() throws Exception {
|
||||
FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024,
|
||||
128l * 1024 * 1024);
|
||||
JobConf job = new JobConf();
|
||||
InputSplit[] splits = fif.getSplits(job, 8);
|
||||
assertEquals(8, splits.length);
|
||||
for (int i = 0; i < splits.length; i++) {
|
||||
InputSplit split = splits[i];
|
||||
assertEquals(("host" + i), split.getLocations()[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("rawtypes")
|
||||
public void testLastInputSplitSingleSplit() throws Exception {
|
||||
FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024,
|
||||
128l * 1024 * 1024);
|
||||
JobConf job = new JobConf();
|
||||
InputSplit[] splits = fif.getSplits(job, 1);
|
||||
assertEquals(1, splits.length);
|
||||
for (int i = 0; i < splits.length; i++) {
|
||||
InputSplit split = splits[i];
|
||||
assertEquals(("host" + i), split.getLocations()[0]);
|
||||
}
|
||||
}
|
||||
|
||||
private class FileInputFormatForTest<K, V> extends FileInputFormat<K, V> {
|
||||
|
||||
long splitSize;
|
||||
long length;
|
||||
|
||||
FileInputFormatForTest(long length, long splitSize) {
|
||||
this.length = length;
|
||||
this.splitSize = splitSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job,
|
||||
Reporter reporter) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected FileStatus[] listStatus(JobConf job) throws IOException {
|
||||
FileStatus mockFileStatus = mock(FileStatus.class);
|
||||
when(mockFileStatus.getBlockSize()).thenReturn(splitSize);
|
||||
when(mockFileStatus.isDirectory()).thenReturn(false);
|
||||
Path mockPath = mock(Path.class);
|
||||
FileSystem mockFs = mock(FileSystem.class);
|
||||
|
||||
BlockLocation[] blockLocations = mockBlockLocations(length, splitSize);
|
||||
when(mockFs.getFileBlockLocations(mockFileStatus, 0, length)).thenReturn(
|
||||
blockLocations);
|
||||
when(mockPath.getFileSystem(any(Configuration.class))).thenReturn(mockFs);
|
||||
|
||||
when(mockFileStatus.getPath()).thenReturn(mockPath);
|
||||
when(mockFileStatus.getLen()).thenReturn(length);
|
||||
|
||||
FileStatus[] fs = new FileStatus[1];
|
||||
fs[0] = mockFileStatus;
|
||||
return fs;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
|
||||
return splitSize;
|
||||
}
|
||||
|
||||
private BlockLocation[] mockBlockLocations(long size, long splitSize) {
|
||||
int numLocations = (int) (size / splitSize);
|
||||
if (size % splitSize != 0)
|
||||
numLocations++;
|
||||
BlockLocation[] blockLocations = new BlockLocation[numLocations];
|
||||
for (int i = 0; i < numLocations; i++) {
|
||||
String[] names = new String[] { "b" + i };
|
||||
String[] hosts = new String[] { "host" + i };
|
||||
blockLocations[i] = new BlockLocation(names, hosts, i * splitSize,
|
||||
Math.min(splitSize, size - (splitSize * i)));
|
||||
}
|
||||
return blockLocations;
|
||||
}
|
||||
}
|
||||
|
||||
static void writeFile(Configuration conf, Path name,
|
||||
short replication, int numBlocks) throws IOException {
|
||||
FileSystem fileSys = FileSystem.get(conf);
|
||||
|
|
|
@ -25,7 +25,6 @@ import java.util.Random;
|
|||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.examples.RandomWriter;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.BytesWritable;
|
|
@ -29,7 +29,6 @@ import java.util.Stack;
|
|||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.examples.RandomTextWriter;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
|
@ -0,0 +1,757 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.mapreduce;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.ClusterStatus;
|
||||
import org.apache.hadoop.mapred.JobClient;
|
||||
import org.apache.hadoop.mapreduce.*;
|
||||
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
||||
/**
|
||||
* This program uses map/reduce to just run a distributed job where there is
|
||||
* no interaction between the tasks and each task writes a large unsorted
|
||||
* random sequence of words.
|
||||
* In order for this program to generate data for terasort with 5-10 words
|
||||
* per key and 20-100 words per value, have the following config:
|
||||
* <xmp>
|
||||
* <?xml version="1.0"?>
|
||||
* <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
|
||||
* <configuration>
|
||||
* <property>
|
||||
* <name>mapreduce.randomtextwriter.minwordskey</name>
|
||||
* <value>5</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomtextwriter.maxwordskey</name>
|
||||
* <value>10</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomtextwriter.minwordsvalue</name>
|
||||
* <value>20</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomtextwriter.maxwordsvalue</name>
|
||||
* <value>100</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomtextwriter.totalbytes</name>
|
||||
* <value>1099511627776</value>
|
||||
* </property>
|
||||
* </configuration></xmp>
|
||||
*
|
||||
* Equivalently, {@link RandomTextWriter} also supports all the above options
|
||||
* and ones supported by {@link Tool} via the command-line.
|
||||
*
|
||||
* To run: bin/hadoop jar hadoop-${version}-examples.jar randomtextwriter
|
||||
* [-outFormat <i>output format class</i>] <i>output</i>
|
||||
*/
|
||||
public class RandomTextWriter extends Configured implements Tool {
|
||||
public static final String TOTAL_BYTES =
|
||||
"mapreduce.randomtextwriter.totalbytes";
|
||||
public static final String BYTES_PER_MAP =
|
||||
"mapreduce.randomtextwriter.bytespermap";
|
||||
public static final String MAPS_PER_HOST =
|
||||
"mapreduce.randomtextwriter.mapsperhost";
|
||||
public static final String MAX_VALUE = "mapreduce.randomtextwriter.maxwordsvalue";
|
||||
public static final String MIN_VALUE = "mapreduce.randomtextwriter.minwordsvalue";
|
||||
public static final String MIN_KEY = "mapreduce.randomtextwriter.minwordskey";
|
||||
public static final String MAX_KEY = "mapreduce.randomtextwriter.maxwordskey";
|
||||
|
||||
static int printUsage() {
|
||||
System.out.println("randomtextwriter " +
|
||||
"[-outFormat <output format class>] " +
|
||||
"<output>");
|
||||
ToolRunner.printGenericCommandUsage(System.out);
|
||||
return 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* User counters
|
||||
*/
|
||||
static enum Counters { RECORDS_WRITTEN, BYTES_WRITTEN }
|
||||
|
||||
static class RandomTextMapper extends Mapper<Text, Text, Text, Text> {
|
||||
|
||||
private long numBytesToWrite;
|
||||
private int minWordsInKey;
|
||||
private int wordsInKeyRange;
|
||||
private int minWordsInValue;
|
||||
private int wordsInValueRange;
|
||||
private Random random = new Random();
|
||||
|
||||
/**
|
||||
* Save the configuration value that we need to write the data.
|
||||
*/
|
||||
public void setup(Context context) {
|
||||
Configuration conf = context.getConfiguration();
|
||||
numBytesToWrite = conf.getLong(BYTES_PER_MAP,
|
||||
1*1024*1024*1024);
|
||||
minWordsInKey = conf.getInt(MIN_KEY, 5);
|
||||
wordsInKeyRange = (conf.getInt(MAX_KEY, 10) - minWordsInKey);
|
||||
minWordsInValue = conf.getInt(MIN_VALUE, 10);
|
||||
wordsInValueRange = (conf.getInt(MAX_VALUE, 100) - minWordsInValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given an output filename, write a bunch of random records to it.
|
||||
*/
|
||||
public void map(Text key, Text value,
|
||||
Context context) throws IOException,InterruptedException {
|
||||
int itemCount = 0;
|
||||
while (numBytesToWrite > 0) {
|
||||
// Generate the key/value
|
||||
int noWordsKey = minWordsInKey +
|
||||
(wordsInKeyRange != 0 ? random.nextInt(wordsInKeyRange) : 0);
|
||||
int noWordsValue = minWordsInValue +
|
||||
(wordsInValueRange != 0 ? random.nextInt(wordsInValueRange) : 0);
|
||||
Text keyWords = generateSentence(noWordsKey);
|
||||
Text valueWords = generateSentence(noWordsValue);
|
||||
|
||||
// Write the sentence
|
||||
context.write(keyWords, valueWords);
|
||||
|
||||
numBytesToWrite -= (keyWords.getLength() + valueWords.getLength());
|
||||
|
||||
// Update counters, progress etc.
|
||||
context.getCounter(Counters.BYTES_WRITTEN).increment(
|
||||
keyWords.getLength() + valueWords.getLength());
|
||||
context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
|
||||
if (++itemCount % 200 == 0) {
|
||||
context.setStatus("wrote record " + itemCount + ". " +
|
||||
numBytesToWrite + " bytes left.");
|
||||
}
|
||||
}
|
||||
context.setStatus("done with " + itemCount + " records.");
|
||||
}
|
||||
|
||||
private Text generateSentence(int noWords) {
|
||||
StringBuffer sentence = new StringBuffer();
|
||||
String space = " ";
|
||||
for (int i=0; i < noWords; ++i) {
|
||||
sentence.append(words[random.nextInt(words.length)]);
|
||||
sentence.append(space);
|
||||
}
|
||||
return new Text(sentence.toString());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the main routine for launching a distributed random write job.
|
||||
* It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
|
||||
* The reduce doesn't do anything.
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
public int run(String[] args) throws Exception {
|
||||
if (args.length == 0) {
|
||||
return printUsage();
|
||||
}
|
||||
|
||||
Configuration conf = getConf();
|
||||
JobClient client = new JobClient(conf);
|
||||
ClusterStatus cluster = client.getClusterStatus();
|
||||
int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
|
||||
long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
|
||||
1*1024*1024*1024);
|
||||
if (numBytesToWritePerMap == 0) {
|
||||
System.err.println("Cannot have " + BYTES_PER_MAP +" set to 0");
|
||||
return -2;
|
||||
}
|
||||
long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
|
||||
numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
|
||||
int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
|
||||
if (numMaps == 0 && totalBytesToWrite > 0) {
|
||||
numMaps = 1;
|
||||
conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
|
||||
}
|
||||
conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
|
||||
|
||||
Job job = new Job(conf);
|
||||
|
||||
job.setJarByClass(RandomTextWriter.class);
|
||||
job.setJobName("random-text-writer");
|
||||
|
||||
job.setOutputKeyClass(Text.class);
|
||||
job.setOutputValueClass(Text.class);
|
||||
|
||||
job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
|
||||
job.setMapperClass(RandomTextMapper.class);
|
||||
|
||||
Class<? extends OutputFormat> outputFormatClass =
|
||||
SequenceFileOutputFormat.class;
|
||||
List<String> otherArgs = new ArrayList<String>();
|
||||
for(int i=0; i < args.length; ++i) {
|
||||
try {
|
||||
if ("-outFormat".equals(args[i])) {
|
||||
outputFormatClass =
|
||||
Class.forName(args[++i]).asSubclass(OutputFormat.class);
|
||||
} else {
|
||||
otherArgs.add(args[i]);
|
||||
}
|
||||
} catch (ArrayIndexOutOfBoundsException except) {
|
||||
System.out.println("ERROR: Required parameter missing from " +
|
||||
args[i-1]);
|
||||
return printUsage(); // exits
|
||||
}
|
||||
}
|
||||
|
||||
job.setOutputFormatClass(outputFormatClass);
|
||||
FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
|
||||
|
||||
System.out.println("Running " + numMaps + " maps.");
|
||||
|
||||
// reducer NONE
|
||||
job.setNumReduceTasks(0);
|
||||
|
||||
Date startTime = new Date();
|
||||
System.out.println("Job started: " + startTime);
|
||||
int ret = job.waitForCompletion(true) ? 0 : 1;
|
||||
Date endTime = new Date();
|
||||
System.out.println("Job ended: " + endTime);
|
||||
System.out.println("The job took " +
|
||||
(endTime.getTime() - startTime.getTime()) /1000 +
|
||||
" seconds.");
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
int res = ToolRunner.run(new Configuration(), new RandomTextWriter(), args);
|
||||
System.exit(res);
|
||||
}
|
||||
|
||||
/**
|
||||
* A random list of words from /usr/share/dict/words (well over 100 entries)
|
||||
*/
|
||||
private static String[] words = {
|
||||
"diurnalness", "Homoiousian",
|
||||
"spiranthic", "tetragynian",
|
||||
"silverhead", "ungreat",
|
||||
"lithograph", "exploiter",
|
||||
"physiologian", "by",
|
||||
"hellbender", "Filipendula",
|
||||
"undeterring", "antiscolic",
|
||||
"pentagamist", "hypoid",
|
||||
"cacuminal", "sertularian",
|
||||
"schoolmasterism", "nonuple",
|
||||
"gallybeggar", "phytonic",
|
||||
"swearingly", "nebular",
|
||||
"Confervales", "thermochemically",
|
||||
"characinoid", "cocksuredom",
|
||||
"fallacious", "feasibleness",
|
||||
"debromination", "playfellowship",
|
||||
"tramplike", "testa",
|
||||
"participatingly", "unaccessible",
|
||||
"bromate", "experientialist",
|
||||
"roughcast", "docimastical",
|
||||
"choralcelo", "blightbird",
|
||||
"peptonate", "sombreroed",
|
||||
"unschematized", "antiabolitionist",
|
||||
"besagne", "mastication",
|
||||
"bromic", "sviatonosite",
|
||||
"cattimandoo", "metaphrastical",
|
||||
"endotheliomyoma", "hysterolysis",
|
||||
"unfulminated", "Hester",
|
||||
"oblongly", "blurredness",
|
||||
"authorling", "chasmy",
|
||||
"Scorpaenidae", "toxihaemia",
|
||||
"Dictograph", "Quakerishly",
|
||||
"deaf", "timbermonger",
|
||||
"strammel", "Thraupidae",
|
||||
"seditious", "plerome",
|
||||
"Arneb", "eristically",
|
||||
"serpentinic", "glaumrie",
|
||||
"socioromantic", "apocalypst",
|
||||
"tartrous", "Bassaris",
|
||||
"angiolymphoma", "horsefly",
|
||||
"kenno", "astronomize",
|
||||
"euphemious", "arsenide",
|
||||
"untongued", "parabolicness",
|
||||
"uvanite", "helpless",
|
||||
"gemmeous", "stormy",
|
||||
"templar", "erythrodextrin",
|
||||
"comism", "interfraternal",
|
||||
"preparative", "parastas",
|
||||
"frontoorbital", "Ophiosaurus",
|
||||
"diopside", "serosanguineous",
|
||||
"ununiformly", "karyological",
|
||||
"collegian", "allotropic",
|
||||
"depravity", "amylogenesis",
|
||||
"reformatory", "epidymides",
|
||||
"pleurotropous", "trillium",
|
||||
"dastardliness", "coadvice",
|
||||
"embryotic", "benthonic",
|
||||
"pomiferous", "figureheadship",
|
||||
"Megaluridae", "Harpa",
|
||||
"frenal", "commotion",
|
||||
"abthainry", "cobeliever",
|
||||
"manilla", "spiciferous",
|
||||
"nativeness", "obispo",
|
||||
"monilioid", "biopsic",
|
||||
"valvula", "enterostomy",
|
||||
"planosubulate", "pterostigma",
|
||||
"lifter", "triradiated",
|
||||
"venialness", "tum",
|
||||
"archistome", "tautness",
|
||||
"unswanlike", "antivenin",
|
||||
"Lentibulariaceae", "Triphora",
|
||||
"angiopathy", "anta",
|
||||
"Dawsonia", "becomma",
|
||||
"Yannigan", "winterproof",
|
||||
"antalgol", "harr",
|
||||
"underogating", "ineunt",
|
||||
"cornberry", "flippantness",
|
||||
"scyphostoma", "approbation",
|
||||
"Ghent", "Macraucheniidae",
|
||||
"scabbiness", "unanatomized",
|
||||
"photoelasticity", "eurythermal",
|
||||
"enation", "prepavement",
|
||||
"flushgate", "subsequentially",
|
||||
"Edo", "antihero",
|
||||
"Isokontae", "unforkedness",
|
||||
"porriginous", "daytime",
|
||||
"nonexecutive", "trisilicic",
|
||||
"morphiomania", "paranephros",
|
||||
"botchedly", "impugnation",
|
||||
"Dodecatheon", "obolus",
|
||||
"unburnt", "provedore",
|
||||
"Aktistetae", "superindifference",
|
||||
"Alethea", "Joachimite",
|
||||
"cyanophilous", "chorograph",
|
||||
"brooky", "figured",
|
||||
"periclitation", "quintette",
|
||||
"hondo", "ornithodelphous",
|
||||
"unefficient", "pondside",
|
||||
"bogydom", "laurinoxylon",
|
||||
"Shiah", "unharmed",
|
||||
"cartful", "noncrystallized",
|
||||
"abusiveness", "cromlech",
|
||||
"japanned", "rizzomed",
|
||||
"underskin", "adscendent",
|
||||
"allectory", "gelatinousness",
|
||||
"volcano", "uncompromisingly",
|
||||
"cubit", "idiotize",
|
||||
"unfurbelowed", "undinted",
|
||||
"magnetooptics", "Savitar",
|
||||
"diwata", "ramosopalmate",
|
||||
"Pishquow", "tomorn",
|
||||
"apopenptic", "Haversian",
|
||||
"Hysterocarpus", "ten",
|
||||
"outhue", "Bertat",
|
||||
"mechanist", "asparaginic",
|
||||
"velaric", "tonsure",
|
||||
"bubble", "Pyrales",
|
||||
"regardful", "glyphography",
|
||||
"calabazilla", "shellworker",
|
||||
"stradametrical", "havoc",
|
||||
"theologicopolitical", "sawdust",
|
||||
"diatomaceous", "jajman",
|
||||
"temporomastoid", "Serrifera",
|
||||
"Ochnaceae", "aspersor",
|
||||
"trailmaking", "Bishareen",
|
||||
"digitule", "octogynous",
|
||||
"epididymitis", "smokefarthings",
|
||||
"bacillite", "overcrown",
|
||||
"mangonism", "sirrah",
|
||||
"undecorated", "psychofugal",
|
||||
"bismuthiferous", "rechar",
|
||||
"Lemuridae", "frameable",
|
||||
"thiodiazole", "Scanic",
|
||||
"sportswomanship", "interruptedness",
|
||||
"admissory", "osteopaedion",
|
||||
"tingly", "tomorrowness",
|
||||
"ethnocracy", "trabecular",
|
||||
"vitally", "fossilism",
|
||||
"adz", "metopon",
|
||||
"prefatorial", "expiscate",
|
||||
"diathermacy", "chronist",
|
||||
"nigh", "generalizable",
|
||||
"hysterogen", "aurothiosulphuric",
|
||||
"whitlowwort", "downthrust",
|
||||
"Protestantize", "monander",
|
||||
"Itea", "chronographic",
|
||||
"silicize", "Dunlop",
|
||||
"eer", "componental",
|
||||
"spot", "pamphlet",
|
||||
"antineuritic", "paradisean",
|
||||
"interruptor", "debellator",
|
||||
"overcultured", "Florissant",
|
||||
"hyocholic", "pneumatotherapy",
|
||||
"tailoress", "rave",
|
||||
"unpeople", "Sebastian",
|
||||
"thermanesthesia", "Coniferae",
|
||||
"swacking", "posterishness",
|
||||
"ethmopalatal", "whittle",
|
||||
"analgize", "scabbardless",
|
||||
"naught", "symbiogenetically",
|
||||
"trip", "parodist",
|
||||
"columniform", "trunnel",
|
||||
"yawler", "goodwill",
|
||||
"pseudohalogen", "swangy",
|
||||
"cervisial", "mediateness",
|
||||
"genii", "imprescribable",
|
||||
"pony", "consumptional",
|
||||
"carposporangial", "poleax",
|
||||
"bestill", "subfebrile",
|
||||
"sapphiric", "arrowworm",
|
||||
"qualminess", "ultraobscure",
|
||||
"thorite", "Fouquieria",
|
||||
"Bermudian", "prescriber",
|
||||
"elemicin", "warlike",
|
||||
"semiangle", "rotular",
|
||||
"misthread", "returnability",
|
||||
"seraphism", "precostal",
|
||||
"quarried", "Babylonism",
|
||||
"sangaree", "seelful",
|
||||
"placatory", "pachydermous",
|
||||
"bozal", "galbulus",
|
||||
"spermaphyte", "cumbrousness",
|
||||
"pope", "signifier",
|
||||
"Endomycetaceae", "shallowish",
|
||||
"sequacity", "periarthritis",
|
||||
"bathysphere", "pentosuria",
|
||||
"Dadaism", "spookdom",
|
||||
"Consolamentum", "afterpressure",
|
||||
"mutter", "louse",
|
||||
"ovoviviparous", "corbel",
|
||||
"metastoma", "biventer",
|
||||
"Hydrangea", "hogmace",
|
||||
"seizing", "nonsuppressed",
|
||||
"oratorize", "uncarefully",
|
||||
"benzothiofuran", "penult",
|
||||
"balanocele", "macropterous",
|
||||
"dishpan", "marten",
|
||||
"absvolt", "jirble",
|
||||
"parmelioid", "airfreighter",
|
||||
"acocotl", "archesporial",
|
||||
"hypoplastral", "preoral",
|
||||
"quailberry", "cinque",
|
||||
"terrestrially", "stroking",
|
||||
"limpet", "moodishness",
|
||||
"canicule", "archididascalian",
|
||||
"pompiloid", "overstaid",
|
||||
"introducer", "Italical",
|
||||
"Christianopaganism", "prescriptible",
|
||||
"subofficer", "danseuse",
|
||||
"cloy", "saguran",
|
||||
"frictionlessly", "deindividualization",
|
||||
"Bulanda", "ventricous",
|
||||
"subfoliar", "basto",
|
||||
"scapuloradial", "suspend",
|
||||
"stiffish", "Sphenodontidae",
|
||||
"eternal", "verbid",
|
||||
"mammonish", "upcushion",
|
||||
"barkometer", "concretion",
|
||||
"preagitate", "incomprehensible",
|
||||
"tristich", "visceral",
|
||||
"hemimelus", "patroller",
|
||||
"stentorophonic", "pinulus",
|
||||
"kerykeion", "brutism",
|
||||
"monstership", "merciful",
|
||||
"overinstruct", "defensibly",
|
||||
"bettermost", "splenauxe",
|
||||
"Mormyrus", "unreprimanded",
|
||||
"taver", "ell",
|
||||
"proacquittal", "infestation",
|
||||
"overwoven", "Lincolnlike",
|
||||
"chacona", "Tamil",
|
||||
"classificational", "lebensraum",
|
||||
"reeveland", "intuition",
|
||||
"Whilkut", "focaloid",
|
||||
"Eleusinian", "micromembrane",
|
||||
"byroad", "nonrepetition",
|
||||
"bacterioblast", "brag",
|
||||
"ribaldrous", "phytoma",
|
||||
"counteralliance", "pelvimetry",
|
||||
"pelf", "relaster",
|
||||
"thermoresistant", "aneurism",
|
||||
"molossic", "euphonym",
|
||||
"upswell", "ladhood",
|
||||
"phallaceous", "inertly",
|
||||
"gunshop", "stereotypography",
|
||||
"laryngic", "refasten",
|
||||
"twinling", "oflete",
|
||||
"hepatorrhaphy", "electrotechnics",
|
||||
"cockal", "guitarist",
|
||||
"topsail", "Cimmerianism",
|
||||
"larklike", "Llandovery",
|
||||
"pyrocatechol", "immatchable",
|
||||
"chooser", "metrocratic",
|
||||
"craglike", "quadrennial",
|
||||
"nonpoisonous", "undercolored",
|
||||
"knob", "ultratense",
|
||||
"balladmonger", "slait",
|
||||
"sialadenitis", "bucketer",
|
||||
"magnificently", "unstipulated",
|
||||
"unscourged", "unsupercilious",
|
||||
"packsack", "pansophism",
|
||||
"soorkee", "percent",
|
||||
"subirrigate", "champer",
|
||||
"metapolitics", "spherulitic",
|
||||
"involatile", "metaphonical",
|
||||
"stachyuraceous", "speckedness",
|
||||
"bespin", "proboscidiform",
|
||||
"gul", "squit",
|
||||
"yeelaman", "peristeropode",
|
||||
"opacousness", "shibuichi",
|
||||
"retinize", "yote",
|
||||
"misexposition", "devilwise",
|
||||
"pumpkinification", "vinny",
|
||||
"bonze", "glossing",
|
||||
"decardinalize", "transcortical",
|
||||
"serphoid", "deepmost",
|
||||
"guanajuatite", "wemless",
|
||||
"arval", "lammy",
|
||||
"Effie", "Saponaria",
|
||||
"tetrahedral", "prolificy",
|
||||
"excerpt", "dunkadoo",
|
||||
"Spencerism", "insatiately",
|
||||
"Gilaki", "oratorship",
|
||||
"arduousness", "unbashfulness",
|
||||
"Pithecolobium", "unisexuality",
|
||||
"veterinarian", "detractive",
|
||||
"liquidity", "acidophile",
|
||||
"proauction", "sural",
|
||||
"totaquina", "Vichyite",
|
||||
"uninhabitedness", "allegedly",
|
||||
"Gothish", "manny",
|
||||
"Inger", "flutist",
|
||||
"ticktick", "Ludgatian",
|
||||
"homotransplant", "orthopedical",
|
||||
"diminutively", "monogoneutic",
|
||||
"Kenipsim", "sarcologist",
|
||||
"drome", "stronghearted",
|
||||
"Fameuse", "Swaziland",
|
||||
"alen", "chilblain",
|
||||
"beatable", "agglomeratic",
|
||||
"constitutor", "tendomucoid",
|
||||
"porencephalous", "arteriasis",
|
||||
"boser", "tantivy",
|
||||
"rede", "lineamental",
|
||||
"uncontradictableness", "homeotypical",
|
||||
"masa", "folious",
|
||||
"dosseret", "neurodegenerative",
|
||||
"subtransverse", "Chiasmodontidae",
|
||||
"palaeotheriodont", "unstressedly",
|
||||
"chalcites", "piquantness",
|
||||
"lampyrine", "Aplacentalia",
|
||||
"projecting", "elastivity",
|
||||
"isopelletierin", "bladderwort",
|
||||
"strander", "almud",
|
||||
"iniquitously", "theologal",
|
||||
"bugre", "chargeably",
|
||||
"imperceptivity", "meriquinoidal",
|
||||
"mesophyte", "divinator",
|
||||
"perfunctory", "counterappellant",
|
||||
"synovial", "charioteer",
|
||||
"crystallographical", "comprovincial",
|
||||
"infrastapedial", "pleasurehood",
|
||||
"inventurous", "ultrasystematic",
|
||||
"subangulated", "supraoesophageal",
|
||||
"Vaishnavism", "transude",
|
||||
"chrysochrous", "ungrave",
|
||||
"reconciliable", "uninterpleaded",
|
||||
"erlking", "wherefrom",
|
||||
"aprosopia", "antiadiaphorist",
|
||||
"metoxazine", "incalculable",
|
||||
"umbellic", "predebit",
|
||||
"foursquare", "unimmortal",
|
||||
"nonmanufacture", "slangy",
|
||||
"predisputant", "familist",
|
||||
"preaffiliate", "friarhood",
|
||||
"corelysis", "zoonitic",
|
||||
"halloo", "paunchy",
|
||||
"neuromimesis", "aconitine",
|
||||
"hackneyed", "unfeeble",
|
||||
"cubby", "autoschediastical",
|
||||
"naprapath", "lyrebird",
|
||||
"inexistency", "leucophoenicite",
|
||||
"ferrogoslarite", "reperuse",
|
||||
"uncombable", "tambo",
|
||||
"propodiale", "diplomatize",
|
||||
"Russifier", "clanned",
|
||||
"corona", "michigan",
|
||||
"nonutilitarian", "transcorporeal",
|
||||
"bought", "Cercosporella",
|
||||
"stapedius", "glandularly",
|
||||
"pictorially", "weism",
|
||||
"disilane", "rainproof",
|
||||
"Caphtor", "scrubbed",
|
||||
"oinomancy", "pseudoxanthine",
|
||||
"nonlustrous", "redesertion",
|
||||
"Oryzorictinae", "gala",
|
||||
"Mycogone", "reappreciate",
|
||||
"cyanoguanidine", "seeingness",
|
||||
"breadwinner", "noreast",
|
||||
"furacious", "epauliere",
|
||||
"omniscribent", "Passiflorales",
|
||||
"uninductive", "inductivity",
|
||||
"Orbitolina", "Semecarpus",
|
||||
"migrainoid", "steprelationship",
|
||||
"phlogisticate", "mesymnion",
|
||||
"sloped", "edificator",
|
||||
"beneficent", "culm",
|
||||
"paleornithology", "unurban",
|
||||
"throbless", "amplexifoliate",
|
||||
"sesquiquintile", "sapience",
|
||||
"astucious", "dithery",
|
||||
"boor", "ambitus",
|
||||
"scotching", "uloid",
|
||||
"uncompromisingness", "hoove",
|
||||
"waird", "marshiness",
|
||||
"Jerusalem", "mericarp",
|
||||
"unevoked", "benzoperoxide",
|
||||
"outguess", "pyxie",
|
||||
"hymnic", "euphemize",
|
||||
"mendacity", "erythremia",
|
||||
"rosaniline", "unchatteled",
|
||||
"lienteria", "Bushongo",
|
||||
"dialoguer", "unrepealably",
|
||||
"rivethead", "antideflation",
|
||||
"vinegarish", "manganosiderite",
|
||||
"doubtingness", "ovopyriform",
|
||||
"Cephalodiscus", "Muscicapa",
|
||||
"Animalivora", "angina",
|
||||
"planispheric", "ipomoein",
|
||||
"cuproiodargyrite", "sandbox",
|
||||
"scrat", "Munnopsidae",
|
||||
"shola", "pentafid",
|
||||
"overstudiousness", "times",
|
||||
"nonprofession", "appetible",
|
||||
"valvulotomy", "goladar",
|
||||
"uniarticular", "oxyterpene",
|
||||
"unlapsing", "omega",
|
||||
"trophonema", "seminonflammable",
|
||||
"circumzenithal", "starer",
|
||||
"depthwise", "liberatress",
|
||||
"unleavened", "unrevolting",
|
||||
"groundneedle", "topline",
|
||||
"wandoo", "umangite",
|
||||
"ordinant", "unachievable",
|
||||
"oversand", "snare",
|
||||
"avengeful", "unexplicit",
|
||||
"mustafina", "sonable",
|
||||
"rehabilitative", "eulogization",
|
||||
"papery", "technopsychology",
|
||||
"impressor", "cresylite",
|
||||
"entame", "transudatory",
|
||||
"scotale", "pachydermatoid",
|
||||
"imaginary", "yeat",
|
||||
"slipped", "stewardship",
|
||||
"adatom", "cockstone",
|
||||
"skyshine", "heavenful",
|
||||
"comparability", "exprobratory",
|
||||
"dermorhynchous", "parquet",
|
||||
"cretaceous", "vesperal",
|
||||
"raphis", "undangered",
|
||||
"Glecoma", "engrain",
|
||||
"counteractively", "Zuludom",
|
||||
"orchiocatabasis", "Auriculariales",
|
||||
"warriorwise", "extraorganismal",
|
||||
"overbuilt", "alveolite",
|
||||
"tetchy", "terrificness",
|
||||
"widdle", "unpremonished",
|
||||
"rebilling", "sequestrum",
|
||||
"equiconvex", "heliocentricism",
|
||||
"catabaptist", "okonite",
|
||||
"propheticism", "helminthagogic",
|
||||
"calycular", "giantly",
|
||||
"wingable", "golem",
|
||||
"unprovided", "commandingness",
|
||||
"greave", "haply",
|
||||
"doina", "depressingly",
|
||||
"subdentate", "impairment",
|
||||
"decidable", "neurotrophic",
|
||||
"unpredict", "bicorporeal",
|
||||
"pendulant", "flatman",
|
||||
"intrabred", "toplike",
|
||||
"Prosobranchiata", "farrantly",
|
||||
"toxoplasmosis", "gorilloid",
|
||||
"dipsomaniacal", "aquiline",
|
||||
"atlantite", "ascitic",
|
||||
"perculsive", "prospectiveness",
|
||||
"saponaceous", "centrifugalization",
|
||||
"dinical", "infravaginal",
|
||||
"beadroll", "affaite",
|
||||
"Helvidian", "tickleproof",
|
||||
"abstractionism", "enhedge",
|
||||
"outwealth", "overcontribute",
|
||||
"coldfinch", "gymnastic",
|
||||
"Pincian", "Munychian",
|
||||
"codisjunct", "quad",
|
||||
"coracomandibular", "phoenicochroite",
|
||||
"amender", "selectivity",
|
||||
"putative", "semantician",
|
||||
"lophotrichic", "Spatangoidea",
|
||||
"saccharogenic", "inferent",
|
||||
"Triconodonta", "arrendation",
|
||||
"sheepskin", "taurocolla",
|
||||
"bunghole", "Machiavel",
|
||||
"triakistetrahedral", "dehairer",
|
||||
"prezygapophysial", "cylindric",
|
||||
"pneumonalgia", "sleigher",
|
||||
"emir", "Socraticism",
|
||||
"licitness", "massedly",
|
||||
"instructiveness", "sturdied",
|
||||
"redecrease", "starosta",
|
||||
"evictor", "orgiastic",
|
||||
"squdge", "meloplasty",
|
||||
"Tsonecan", "repealableness",
|
||||
"swoony", "myesthesia",
|
||||
"molecule", "autobiographist",
|
||||
"reciprocation", "refective",
|
||||
"unobservantness", "tricae",
|
||||
"ungouged", "floatability",
|
||||
"Mesua", "fetlocked",
|
||||
"chordacentrum", "sedentariness",
|
||||
"various", "laubanite",
|
||||
"nectopod", "zenick",
|
||||
"sequentially", "analgic",
|
||||
"biodynamics", "posttraumatic",
|
||||
"nummi", "pyroacetic",
|
||||
"bot", "redescend",
|
||||
"dispermy", "undiffusive",
|
||||
"circular", "trillion",
|
||||
"Uraniidae", "ploration",
|
||||
"discipular", "potentness",
|
||||
"sud", "Hu",
|
||||
"Eryon", "plugger",
|
||||
"subdrainage", "jharal",
|
||||
"abscission", "supermarket",
|
||||
"countergabion", "glacierist",
|
||||
"lithotresis", "minniebush",
|
||||
"zanyism", "eucalypteol",
|
||||
"sterilely", "unrealize",
|
||||
"unpatched", "hypochondriacism",
|
||||
"critically", "cheesecutter",
|
||||
};
|
||||
}
|
|
@ -0,0 +1,298 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.mapreduce;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.BytesWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.Writable;
|
||||
import org.apache.hadoop.io.WritableComparable;
|
||||
import org.apache.hadoop.mapred.ClusterStatus;
|
||||
import org.apache.hadoop.mapred.JobClient;
|
||||
import org.apache.hadoop.mapreduce.*;
|
||||
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
|
||||
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.hadoop.util.GenericOptionsParser;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
||||
/**
|
||||
* This program uses map/reduce to just run a distributed job where there is
|
||||
* no interaction between the tasks and each task writes a large unsorted
|
||||
* random binary sequence file of BytesWritable.
|
||||
* In order for this program to generate data for terasort with 10-byte keys
|
||||
* and 90-byte values, have the following config:
|
||||
* <xmp>
|
||||
* <?xml version="1.0"?>
|
||||
* <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
|
||||
* <configuration>
|
||||
* <property>
|
||||
* <name>mapreduce.randomwriter.minkey</name>
|
||||
* <value>10</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomwriter.maxkey</name>
|
||||
* <value>10</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomwriter.minvalue</name>
|
||||
* <value>90</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomwriter.maxvalue</name>
|
||||
* <value>90</value>
|
||||
* </property>
|
||||
* <property>
|
||||
* <name>mapreduce.randomwriter.totalbytes</name>
|
||||
* <value>1099511627776</value>
|
||||
* </property>
|
||||
* </configuration></xmp>
|
||||
*
|
||||
* Equivalently, {@link RandomWriter} also supports all the above options
|
||||
* and ones supported by {@link GenericOptionsParser} via the command-line.
|
||||
*/
|
||||
public class RandomWriter extends Configured implements Tool {
|
||||
public static final String TOTAL_BYTES = "mapreduce.randomwriter.totalbytes";
|
||||
public static final String BYTES_PER_MAP =
|
||||
"mapreduce.randomwriter.bytespermap";
|
||||
public static final String MAPS_PER_HOST =
|
||||
"mapreduce.randomwriter.mapsperhost";
|
||||
public static final String MAX_VALUE = "mapreduce.randomwriter.maxvalue";
|
||||
public static final String MIN_VALUE = "mapreduce.randomwriter.minvalue";
|
||||
public static final String MIN_KEY = "mapreduce.randomwriter.minkey";
|
||||
public static final String MAX_KEY = "mapreduce.randomwriter.maxkey";
|
||||
|
||||
/**
|
||||
* User counters
|
||||
*/
|
||||
static enum Counters { RECORDS_WRITTEN, BYTES_WRITTEN }
|
||||
|
||||
/**
|
||||
* A custom input format that creates virtual inputs of a single string
|
||||
* for each map.
|
||||
*/
|
||||
static class RandomInputFormat extends InputFormat<Text, Text> {
|
||||
|
||||
/**
|
||||
* Generate the requested number of file splits, with the filename
|
||||
* set to the filename of the output file.
|
||||
*/
|
||||
public List<InputSplit> getSplits(JobContext job) throws IOException {
|
||||
List<InputSplit> result = new ArrayList<InputSplit>();
|
||||
Path outDir = FileOutputFormat.getOutputPath(job);
|
||||
int numSplits =
|
||||
job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
|
||||
for(int i=0; i < numSplits; ++i) {
|
||||
result.add(new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1,
|
||||
(String[])null));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a single record (filename, "") where the filename is taken from
|
||||
* the file split.
|
||||
*/
|
||||
static class RandomRecordReader extends RecordReader<Text, Text> {
|
||||
Path name;
|
||||
Text key = null;
|
||||
Text value = new Text();
|
||||
public RandomRecordReader(Path p) {
|
||||
name = p;
|
||||
}
|
||||
|
||||
public void initialize(InputSplit split,
|
||||
TaskAttemptContext context)
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
}
|
||||
|
||||
public boolean nextKeyValue() {
|
||||
if (name != null) {
|
||||
key = new Text();
|
||||
key.set(name.getName());
|
||||
name = null;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public Text getCurrentKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
public Text getCurrentValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void close() {}
|
||||
|
||||
public float getProgress() {
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
public RecordReader<Text, Text> createRecordReader(InputSplit split,
|
||||
TaskAttemptContext context) throws IOException, InterruptedException {
|
||||
return new RandomRecordReader(((FileSplit) split).getPath());
|
||||
}
|
||||
}
|
||||
|
||||
static class RandomMapper extends Mapper<WritableComparable, Writable,
|
||||
BytesWritable, BytesWritable> {
|
||||
|
||||
private long numBytesToWrite;
|
||||
private int minKeySize;
|
||||
private int keySizeRange;
|
||||
private int minValueSize;
|
||||
private int valueSizeRange;
|
||||
private Random random = new Random();
|
||||
private BytesWritable randomKey = new BytesWritable();
|
||||
private BytesWritable randomValue = new BytesWritable();
|
||||
|
||||
private void randomizeBytes(byte[] data, int offset, int length) {
|
||||
for(int i=offset + length - 1; i >= offset; --i) {
|
||||
data[i] = (byte) random.nextInt(256);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given an output filename, write a bunch of random records to it.
|
||||
*/
|
||||
public void map(WritableComparable key,
|
||||
Writable value,
|
||||
Context context) throws IOException,InterruptedException {
|
||||
int itemCount = 0;
|
||||
while (numBytesToWrite > 0) {
|
||||
int keyLength = minKeySize +
|
||||
(keySizeRange != 0 ? random.nextInt(keySizeRange) : 0);
|
||||
randomKey.setSize(keyLength);
|
||||
randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
|
||||
int valueLength = minValueSize +
|
||||
(valueSizeRange != 0 ? random.nextInt(valueSizeRange) : 0);
|
||||
randomValue.setSize(valueLength);
|
||||
randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
|
||||
context.write(randomKey, randomValue);
|
||||
numBytesToWrite -= keyLength + valueLength;
|
||||
context.getCounter(Counters.BYTES_WRITTEN).increment(keyLength + valueLength);
|
||||
context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
|
||||
if (++itemCount % 200 == 0) {
|
||||
context.setStatus("wrote record " + itemCount + ". " +
|
||||
numBytesToWrite + " bytes left.");
|
||||
}
|
||||
}
|
||||
context.setStatus("done with " + itemCount + " records.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Save the values out of the configuaration that we need to write
|
||||
* the data.
|
||||
*/
|
||||
@Override
|
||||
public void setup(Context context) {
|
||||
Configuration conf = context.getConfiguration();
|
||||
numBytesToWrite = conf.getLong(BYTES_PER_MAP,
|
||||
1*1024*1024*1024);
|
||||
minKeySize = conf.getInt(MIN_KEY, 10);
|
||||
keySizeRange =
|
||||
conf.getInt(MAX_KEY, 1000) - minKeySize;
|
||||
minValueSize = conf.getInt(MIN_VALUE, 0);
|
||||
valueSizeRange =
|
||||
conf.getInt(MAX_VALUE, 20000) - minValueSize;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the main routine for launching a distributed random write job.
|
||||
* It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
|
||||
* The reduce doesn't do anything.
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
public int run(String[] args) throws Exception {
|
||||
if (args.length == 0) {
|
||||
System.out.println("Usage: writer <out-dir>");
|
||||
ToolRunner.printGenericCommandUsage(System.out);
|
||||
return 2;
|
||||
}
|
||||
|
||||
Path outDir = new Path(args[0]);
|
||||
Configuration conf = getConf();
|
||||
JobClient client = new JobClient(conf);
|
||||
ClusterStatus cluster = client.getClusterStatus();
|
||||
int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
|
||||
long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
|
||||
1*1024*1024*1024);
|
||||
if (numBytesToWritePerMap == 0) {
|
||||
System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
|
||||
return -2;
|
||||
}
|
||||
long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
|
||||
numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
|
||||
int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
|
||||
if (numMaps == 0 && totalBytesToWrite > 0) {
|
||||
numMaps = 1;
|
||||
conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
|
||||
}
|
||||
conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
|
||||
|
||||
Job job = new Job(conf);
|
||||
|
||||
job.setJarByClass(RandomWriter.class);
|
||||
job.setJobName("random-writer");
|
||||
FileOutputFormat.setOutputPath(job, outDir);
|
||||
job.setOutputKeyClass(BytesWritable.class);
|
||||
job.setOutputValueClass(BytesWritable.class);
|
||||
job.setInputFormatClass(RandomInputFormat.class);
|
||||
job.setMapperClass(RandomMapper.class);
|
||||
job.setReducerClass(Reducer.class);
|
||||
job.setOutputFormatClass(SequenceFileOutputFormat.class);
|
||||
|
||||
System.out.println("Running " + numMaps + " maps.");
|
||||
|
||||
// reducer NONE
|
||||
job.setNumReduceTasks(0);
|
||||
|
||||
Date startTime = new Date();
|
||||
System.out.println("Job started: " + startTime);
|
||||
int ret = job.waitForCompletion(true) ? 0 : 1;
|
||||
Date endTime = new Date();
|
||||
System.out.println("Job ended: " + endTime);
|
||||
System.out.println("The job took " +
|
||||
(endTime.getTime() - startTime.getTime()) /1000 +
|
||||
" seconds.");
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
int res = ToolRunner.run(new Configuration(), new RandomWriter(), args);
|
||||
System.exit(res);
|
||||
}
|
||||
|
||||
}
|
|
@ -19,7 +19,9 @@
|
|||
package org.apache.hadoop.mapreduce.lib.input;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.*;
|
||||
|
@ -28,10 +30,15 @@ import static org.mockito.Mockito.*;
|
|||
import static org.apache.hadoop.test.MockitoMaker.*;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.BlockLocation;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.mapreduce.InputSplit;
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.JobContext;
|
||||
import org.apache.hadoop.mapreduce.RecordReader;
|
||||
import org.apache.hadoop.mapreduce.TaskAttemptContext;
|
||||
|
||||
public class TestFileInputFormat {
|
||||
|
||||
|
@ -80,4 +87,108 @@ public class TestFileInputFormat {
|
|||
ispy.getSplits(job);
|
||||
verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
@SuppressWarnings({"rawtypes", "unchecked"})
|
||||
public void testLastInputSplitAtSplitBoundary() throws Exception {
|
||||
FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024,
|
||||
128l * 1024 * 1024);
|
||||
Configuration conf = new Configuration();
|
||||
JobContext jobContext = mock(JobContext.class);
|
||||
when(jobContext.getConfiguration()).thenReturn(conf);
|
||||
List<InputSplit> splits = fif.getSplits(jobContext);
|
||||
assertEquals(8, splits.size());
|
||||
for (int i = 0 ; i < splits.size() ; i++) {
|
||||
InputSplit split = splits.get(i);
|
||||
assertEquals(("host" + i), split.getLocations()[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@SuppressWarnings({ "rawtypes", "unchecked" })
|
||||
public void testLastInputSplitExceedingSplitBoundary() throws Exception {
|
||||
FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024,
|
||||
128l * 1024 * 1024);
|
||||
Configuration conf = new Configuration();
|
||||
JobContext jobContext = mock(JobContext.class);
|
||||
when(jobContext.getConfiguration()).thenReturn(conf);
|
||||
List<InputSplit> splits = fif.getSplits(jobContext);
|
||||
assertEquals(8, splits.size());
|
||||
for (int i = 0; i < splits.size(); i++) {
|
||||
InputSplit split = splits.get(i);
|
||||
assertEquals(("host" + i), split.getLocations()[0]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@SuppressWarnings({ "rawtypes", "unchecked" })
|
||||
public void testLastInputSplitSingleSplit() throws Exception {
|
||||
FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024,
|
||||
128l * 1024 * 1024);
|
||||
Configuration conf = new Configuration();
|
||||
JobContext jobContext = mock(JobContext.class);
|
||||
when(jobContext.getConfiguration()).thenReturn(conf);
|
||||
List<InputSplit> splits = fif.getSplits(jobContext);
|
||||
assertEquals(1, splits.size());
|
||||
for (int i = 0; i < splits.size(); i++) {
|
||||
InputSplit split = splits.get(i);
|
||||
assertEquals(("host" + i), split.getLocations()[0]);
|
||||
}
|
||||
}
|
||||
|
||||
private class FileInputFormatForTest<K, V> extends FileInputFormat<K, V> {
|
||||
|
||||
long splitSize;
|
||||
long length;
|
||||
|
||||
FileInputFormatForTest(long length, long splitSize) {
|
||||
this.length = length;
|
||||
this.splitSize = splitSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RecordReader<K, V> createRecordReader(InputSplit split,
|
||||
TaskAttemptContext context) throws IOException, InterruptedException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<FileStatus> listStatus(JobContext job) throws IOException {
|
||||
FileStatus mockFileStatus = mock(FileStatus.class);
|
||||
when(mockFileStatus.getBlockSize()).thenReturn(splitSize);
|
||||
Path mockPath = mock(Path.class);
|
||||
FileSystem mockFs = mock(FileSystem.class);
|
||||
|
||||
BlockLocation[] blockLocations = mockBlockLocations(length, splitSize);
|
||||
when(mockFs.getFileBlockLocations(mockFileStatus, 0, length)).thenReturn(
|
||||
blockLocations);
|
||||
when(mockPath.getFileSystem(any(Configuration.class))).thenReturn(mockFs);
|
||||
|
||||
when(mockFileStatus.getPath()).thenReturn(mockPath);
|
||||
when(mockFileStatus.getLen()).thenReturn(length);
|
||||
|
||||
List<FileStatus> list = new ArrayList<FileStatus>();
|
||||
list.add(mockFileStatus);
|
||||
return list;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
|
||||
return splitSize;
|
||||
}
|
||||
|
||||
private BlockLocation[] mockBlockLocations(long size, long splitSize) {
|
||||
int numLocations = (int) (size / splitSize);
|
||||
if (size % splitSize != 0)
|
||||
numLocations++;
|
||||
BlockLocation[] blockLocations = new BlockLocation[numLocations];
|
||||
for (int i = 0; i < numLocations; i++) {
|
||||
String[] names = new String[] { "b" + i };
|
||||
String[] hosts = new String[] { "host" + i };
|
||||
blockLocations[i] = new BlockLocation(names, hosts, i * splitSize,
|
||||
Math.min(splitSize, size - (splitSize * i)));
|
||||
}
|
||||
return blockLocations;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -238,9 +238,11 @@ public class TeraGen extends Configured implements Tool {
|
|||
|
||||
@Override
|
||||
public void cleanup(Context context) {
|
||||
if (checksumCounter != null) {
|
||||
checksumCounter.increment(total.getLow8());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void usage() throws IOException {
|
||||
System.err.println("teragen <num rows> <output dir>");
|
||||
|
@ -307,5 +309,4 @@ public class TeraGen extends Configured implements Tool {
|
|||
int res = ToolRunner.run(new Configuration(), new TeraGen(), args);
|
||||
System.exit(res);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -85,20 +85,6 @@ public interface ApplicationConstants {
|
|||
|
||||
public static final String STDOUT = "stdout";
|
||||
|
||||
/**
|
||||
* Classpath for typical applications.
|
||||
*/
|
||||
public static final String[] APPLICATION_CLASSPATH =
|
||||
new String[] {
|
||||
"$HADOOP_CONF_DIR",
|
||||
"$HADOOP_COMMON_HOME/share/hadoop/common/*",
|
||||
"$HADOOP_COMMON_HOME/share/hadoop/common/lib/*",
|
||||
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/*",
|
||||
"$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*",
|
||||
"$YARN_HOME/share/hadoop/mapreduce/*",
|
||||
"$YARN_HOME/share/hadoop/mapreduce/lib/*"
|
||||
};
|
||||
|
||||
/**
|
||||
* Environment for Applications.
|
||||
*
|
||||
|
|
|
@ -508,6 +508,10 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
|
||||
2000;
|
||||
|
||||
/** Classpath for typical applications. */
|
||||
public static final String YARN_APPLICATION_CLASSPATH = YARN_PREFIX
|
||||
+ "application.classpath";
|
||||
|
||||
public YarnConfiguration() {
|
||||
super();
|
||||
}
|
||||
|
|
|
@ -36,6 +36,7 @@ import com.google.common.collect.Lists;
|
|||
import com.google.inject.Provides;
|
||||
import com.google.inject.servlet.GuiceFilter;
|
||||
import com.google.inject.servlet.ServletModule;
|
||||
import com.sun.jersey.api.container.filter.GZIPContentEncodingFilter;
|
||||
import com.sun.jersey.api.core.ResourceConfig;
|
||||
import com.sun.jersey.core.util.FeaturesAndProperties;
|
||||
import com.sun.jersey.guice.spi.container.servlet.GuiceContainer;
|
||||
|
@ -160,6 +161,8 @@ public abstract class WebApp extends ServletModule {
|
|||
params.put(ResourceConfig.FEATURE_IMPLICIT_VIEWABLES, "true");
|
||||
params.put(ServletContainer.FEATURE_FILTER_FORWARD_ON_404, "true");
|
||||
params.put(FeaturesAndProperties.FEATURE_XMLROOTELEMENT_PROCESSING, "true");
|
||||
params.put(ResourceConfig.PROPERTY_CONTAINER_REQUEST_FILTERS, GZIPContentEncodingFilter.class.getName());
|
||||
params.put(ResourceConfig.PROPERTY_CONTAINER_RESPONSE_FILTERS, GZIPContentEncodingFilter.class.getName());
|
||||
filter("/*").through(GuiceContainer.class, params);
|
||||
}
|
||||
|
||||
|
|
|
@ -482,4 +482,18 @@
|
|||
<name>yarn.web-proxy.address</name>
|
||||
<value/>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<description>Classpath for typical applications.</description>
|
||||
<name>yarn.application.classpath</name>
|
||||
<value>
|
||||
$HADOOP_CONF_DIR,
|
||||
$HADOOP_COMMON_HOME/share/hadoop/common/*,
|
||||
$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
|
||||
$HADOOP_HDFS_HOME/share/hadoop/hdfs/*,
|
||||
$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
|
||||
$YARN_HOME/share/hadoop/mapreduce/*,
|
||||
$YARN_HOME/share/hadoop/mapreduce/lib/*
|
||||
</value>
|
||||
</property>
|
||||
</configuration>
|
||||
|
|
|
@ -295,10 +295,6 @@ public class SchedulerApp {
|
|||
}
|
||||
}
|
||||
|
||||
public synchronized void setAvailableResourceLimit(Resource globalLimit) {
|
||||
this.resourceLimit = globalLimit;
|
||||
}
|
||||
|
||||
public synchronized RMContainer getRMContainer(ContainerId id) {
|
||||
return liveContainers.get(id);
|
||||
}
|
||||
|
@ -446,20 +442,21 @@ public class SchedulerApp {
|
|||
return reservedContainers;
|
||||
}
|
||||
|
||||
public synchronized void setHeadroom(Resource globalLimit) {
|
||||
this.resourceLimit = globalLimit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get available headroom in terms of resources for the application's user.
|
||||
* @return available resource headroom
|
||||
*/
|
||||
public synchronized Resource getHeadroom() {
|
||||
Resource limit = Resources.subtract(resourceLimit, currentConsumption);
|
||||
Resources.subtractFrom(limit, currentReservation);
|
||||
|
||||
// Corner case to deal with applications being slightly over-limit
|
||||
if (limit.getMemory() < 0) {
|
||||
limit.setMemory(0);
|
||||
if (resourceLimit.getMemory() < 0) {
|
||||
resourceLimit.setMemory(0);
|
||||
}
|
||||
|
||||
return limit;
|
||||
return resourceLimit;
|
||||
}
|
||||
|
||||
public Queue getQueue() {
|
||||
|
|
|
@ -17,12 +17,19 @@
|
|||
*/
|
||||
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
|
||||
class CSQueueUtils {
|
||||
|
||||
public static void checkMaxCapacity(String queueName,
|
||||
float capacity, float maximumCapacity) {
|
||||
if (Math.round(100 * maximumCapacity) != CapacitySchedulerConfiguration.UNDEFINED &&
|
||||
if (maximumCapacity < 0.0f || maximumCapacity > 1.0f ||
|
||||
maximumCapacity < capacity) {
|
||||
throw new IllegalArgumentException(
|
||||
"Illegal value of maximumCapacity " + maximumCapacity +
|
||||
" used in call to setMaxCapacity for queue " + queueName);
|
||||
}
|
||||
if (maximumCapacity < capacity) {
|
||||
throw new IllegalArgumentException(
|
||||
"Illegal call to setMaxCapacity. " +
|
||||
"Queue '" + queueName + "' has " +
|
||||
|
@ -31,4 +38,25 @@ class CSQueueUtils {
|
|||
}
|
||||
}
|
||||
|
||||
public static float computeAbsoluteMaximumCapacity(
|
||||
float maximumCapacity, CSQueue parent) {
|
||||
float parentAbsMaxCapacity =
|
||||
(parent == null) ? 1.0f : parent.getAbsoluteMaximumCapacity();
|
||||
return (parentAbsMaxCapacity * maximumCapacity);
|
||||
}
|
||||
|
||||
public static int computeMaxActiveApplications(Resource clusterResource,
|
||||
float maxAMResourcePercent, float absoluteCapacity) {
|
||||
return
|
||||
Math.max(
|
||||
(int)((clusterResource.getMemory() / (float)LeafQueue.DEFAULT_AM_RESOURCE) *
|
||||
maxAMResourcePercent * absoluteCapacity),
|
||||
1);
|
||||
}
|
||||
|
||||
public static int computeMaxActiveApplicationsPerUser(
|
||||
int maxActiveApplications, int userLimit, float userLimitFactor) {
|
||||
return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -149,7 +149,7 @@ public class CapacitySchedulerConfiguration extends Configuration {
|
|||
throw new IllegalArgumentException("Illegal " +
|
||||
"capacity of " + capacity + " for queue " + queue);
|
||||
}
|
||||
LOG.debug("CSConf - setCapacity: queuePrefix=" + getQueuePrefix(queue) +
|
||||
LOG.debug("CSConf - getCapacity: queuePrefix=" + getQueuePrefix(queue) +
|
||||
", capacity=" + capacity);
|
||||
return capacity;
|
||||
}
|
||||
|
@ -162,11 +162,15 @@ public class CapacitySchedulerConfiguration extends Configuration {
|
|||
|
||||
public int getMaximumCapacity(String queue) {
|
||||
int maxCapacity =
|
||||
getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, UNDEFINED);
|
||||
getInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, MAXIMUM_CAPACITY_VALUE);
|
||||
return maxCapacity;
|
||||
}
|
||||
|
||||
public void setMaximumCapacity(String queue, int maxCapacity) {
|
||||
if (maxCapacity > MAXIMUM_CAPACITY_VALUE) {
|
||||
throw new IllegalArgumentException("Illegal " +
|
||||
"maximum-capacity of " + maxCapacity + " for queue " + queue);
|
||||
}
|
||||
setInt(getQueuePrefix(queue) + MAXIMUM_CAPACITY, maxCapacity);
|
||||
LOG.debug("CSConf - setMaxCapacity: queuePrefix=" + getQueuePrefix(queue) +
|
||||
", maxCapacity=" + maxCapacity);
|
||||
|
|
|
@ -144,10 +144,10 @@ public class LeafQueue implements CSQueue {
|
|||
(float)cs.getConfiguration().getCapacity(getQueuePath()) / 100;
|
||||
float absoluteCapacity = parent.getAbsoluteCapacity() * capacity;
|
||||
|
||||
float maximumCapacity = (float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
|
||||
float maximumCapacity =
|
||||
(float)cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
|
||||
float absoluteMaxCapacity =
|
||||
(Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ?
|
||||
Float.MAX_VALUE : (parent.getAbsoluteCapacity() * maximumCapacity);
|
||||
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
|
||||
|
||||
int userLimit = cs.getConfiguration().getUserLimit(getQueuePath());
|
||||
float userLimitFactor =
|
||||
|
@ -161,10 +161,10 @@ public class LeafQueue implements CSQueue {
|
|||
this.maxAMResourcePercent =
|
||||
cs.getConfiguration().getMaximumApplicationMasterResourcePercent();
|
||||
int maxActiveApplications =
|
||||
computeMaxActiveApplications(cs.getClusterResources(),
|
||||
CSQueueUtils.computeMaxActiveApplications(cs.getClusterResources(),
|
||||
maxAMResourcePercent, absoluteCapacity);
|
||||
int maxActiveApplicationsPerUser =
|
||||
computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
|
||||
CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
|
||||
userLimitFactor);
|
||||
|
||||
this.queueInfo = recordFactory.newRecordInstance(QueueInfo.class);
|
||||
|
@ -193,20 +193,6 @@ public class LeafQueue implements CSQueue {
|
|||
this.activeApplications = new TreeSet<SchedulerApp>(applicationComparator);
|
||||
}
|
||||
|
||||
private int computeMaxActiveApplications(Resource clusterResource,
|
||||
float maxAMResourcePercent, float absoluteCapacity) {
|
||||
return
|
||||
Math.max(
|
||||
(int)((clusterResource.getMemory() / (float)DEFAULT_AM_RESOURCE) *
|
||||
maxAMResourcePercent * absoluteCapacity),
|
||||
1);
|
||||
}
|
||||
|
||||
private int computeMaxActiveApplicationsPerUser(int maxActiveApplications,
|
||||
int userLimit, float userLimitFactor) {
|
||||
return (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor);
|
||||
}
|
||||
|
||||
private synchronized void setupQueueConfigs(
|
||||
float capacity, float absoluteCapacity,
|
||||
float maximumCapacity, float absoluteMaxCapacity,
|
||||
|
@ -254,8 +240,8 @@ public class LeafQueue implements CSQueue {
|
|||
"maxCapacity = " + maximumCapacity +
|
||||
" [= configuredMaxCapacity ]" + "\n" +
|
||||
"absoluteMaxCapacity = " + absoluteMaxCapacity +
|
||||
" [= Float.MAX_VALUE if maximumCapacity undefined, " +
|
||||
"(parentAbsoluteCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
|
||||
" [= 1.0 maximumCapacity undefined, " +
|
||||
"(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" + "\n" +
|
||||
"userLimit = " + userLimit +
|
||||
" [= configuredUserLimit ]" + "\n" +
|
||||
"userLimitFactor = " + userLimitFactor +
|
||||
|
@ -272,9 +258,9 @@ public class LeafQueue implements CSQueue {
|
|||
"maxActiveApplicationsPerUser = " + maxActiveApplicationsPerUser +
|
||||
" [= (int)(maxActiveApplications * (userLimit / 100.0f) * userLimitFactor) ]" + "\n" +
|
||||
"utilization = " + utilization +
|
||||
" [= usedResourcesMemory / queueLimit ]" + "\n" +
|
||||
" [= usedResourcesMemory / (clusterResourceMemory * absoluteCapacity)]" + "\n" +
|
||||
"usedCapacity = " + usedCapacity +
|
||||
" [= usedResourcesMemory / (clusterResourceMemory * capacity) ]" + "\n" +
|
||||
" [= usedResourcesMemory / (clusterResourceMemory * parent.absoluteCapacity)]" + "\n" +
|
||||
"maxAMResourcePercent = " + maxAMResourcePercent +
|
||||
" [= configuredMaximumAMResourcePercent ]" + "\n" +
|
||||
"minimumAllocationFactor = " + minimumAllocationFactor +
|
||||
|
@ -400,9 +386,7 @@ public class LeafQueue implements CSQueue {
|
|||
|
||||
this.maximumCapacity = maximumCapacity;
|
||||
this.absoluteMaxCapacity =
|
||||
(Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ?
|
||||
Float.MAX_VALUE :
|
||||
(parent.getAbsoluteCapacity() * maximumCapacity);
|
||||
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -502,9 +486,14 @@ public class LeafQueue implements CSQueue {
|
|||
}
|
||||
|
||||
public String toString() {
|
||||
return queueName + ":" + capacity + ":" + absoluteCapacity + ":" +
|
||||
getUsedCapacity() + ":" + getUtilization() + ":" +
|
||||
getNumApplications() + ":" + getNumContainers();
|
||||
return queueName + ": " +
|
||||
"capacity=" + capacity + ", " +
|
||||
"absoluteCapacity=" + absoluteCapacity + ", " +
|
||||
"usedResources=" + usedResources.getMemory() + "MB, " +
|
||||
"usedCapacity=" + getUsedCapacity() + ", " +
|
||||
"utilization=" + getUtilization() + ", " +
|
||||
"numApps=" + getNumApplications() + ", " +
|
||||
"numContainers=" + getNumContainers();
|
||||
}
|
||||
|
||||
private synchronized User getUser(String userName) {
|
||||
|
@ -731,12 +720,11 @@ public class LeafQueue implements CSQueue {
|
|||
if(LOG.isDebugEnabled()) {
|
||||
LOG.debug("pre-assignContainers for application "
|
||||
+ application.getApplicationId());
|
||||
}
|
||||
application.showRequests();
|
||||
}
|
||||
|
||||
synchronized (application) {
|
||||
computeAndSetUserResourceLimit(application, clusterResource);
|
||||
|
||||
// Schedule in priority order
|
||||
for (Priority priority : application.getPriorities()) {
|
||||
// Required resource
|
||||
Resource required =
|
||||
|
@ -747,15 +735,21 @@ public class LeafQueue implements CSQueue {
|
|||
continue;
|
||||
}
|
||||
|
||||
// Are we going over limits by allocating to this application?
|
||||
// Maximum Capacity of the queue
|
||||
// Compute & set headroom
|
||||
// Note: We set the headroom with the highest priority request
|
||||
// as the target.
|
||||
// This works since we never assign lower priority requests
|
||||
// before all higher priority ones are serviced.
|
||||
Resource userLimit =
|
||||
computeAndSetUserResourceLimit(application, clusterResource,
|
||||
required);
|
||||
|
||||
// Check queue max-capacity limit
|
||||
if (!assignToQueue(clusterResource, required)) {
|
||||
return NULL_ASSIGNMENT;
|
||||
}
|
||||
|
||||
// User limits
|
||||
Resource userLimit =
|
||||
computeUserLimit(application, clusterResource, required);
|
||||
// Check user limit
|
||||
if (!assignToUser(application.getUser(), userLimit)) {
|
||||
break;
|
||||
}
|
||||
|
@ -830,25 +824,28 @@ public class LeafQueue implements CSQueue {
|
|||
float potentialNewCapacity =
|
||||
(float)(usedResources.getMemory() + required.getMemory()) /
|
||||
clusterResource.getMemory();
|
||||
if (potentialNewCapacity > absoluteMaxCapacity) {
|
||||
LOG.info(getQueueName() +
|
||||
" usedResources: " + usedResources.getMemory() +
|
||||
" clusterResources: " + clusterResource.getMemory() +
|
||||
" currentCapacity " + ((float)usedResources.getMemory())/clusterResource.getMemory() +
|
||||
" required " + required.getMemory() +
|
||||
" potentialNewCapacity: " + potentialNewCapacity + " ( " +
|
||||
" max-capacity: " + absoluteMaxCapacity + ")");
|
||||
if (potentialNewCapacity > absoluteMaxCapacity) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private void computeAndSetUserResourceLimit(SchedulerApp application,
|
||||
Resource clusterResource) {
|
||||
Resource userLimit =
|
||||
computeUserLimit(application, clusterResource, Resources.none());
|
||||
application.setAvailableResourceLimit(userLimit);
|
||||
metrics.setAvailableResourcesToUser(application.getUser(),
|
||||
application.getHeadroom());
|
||||
private Resource computeAndSetUserResourceLimit(SchedulerApp application,
|
||||
Resource clusterResource, Resource required) {
|
||||
String user = application.getUser();
|
||||
Resource limit = computeUserLimit(application, clusterResource, required);
|
||||
Resource headroom =
|
||||
Resources.subtract(limit, getUser(user).getConsumedResources());
|
||||
application.setHeadroom(headroom);
|
||||
metrics.setAvailableResourcesToUser(user, headroom);
|
||||
return limit;
|
||||
}
|
||||
|
||||
private int roundUp(int memory) {
|
||||
|
@ -919,7 +916,7 @@ public class LeafQueue implements CSQueue {
|
|||
User user = getUser(userName);
|
||||
|
||||
// Note: We aren't considering the current request since there is a fixed
|
||||
// overhead of the AM, but it's a >= check, so...
|
||||
// overhead of the AM, but it's a > check, not a >= check, so...
|
||||
if ((user.getConsumedResources().getMemory()) > limit.getMemory()) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("User " + userName + " in queue " + getQueueName() +
|
||||
|
@ -1237,8 +1234,8 @@ public class LeafQueue implements CSQueue {
|
|||
// happen under scheduler's lock...
|
||||
// So, this is, in effect, a transaction across application & node
|
||||
if (rmContainer.getState() == RMContainerState.RESERVED) {
|
||||
application.unreserve(node, rmContainer.getReservedPriority());
|
||||
node.unreserveResource(application);
|
||||
unreserve(application, rmContainer.getReservedPriority(),
|
||||
node, rmContainer);
|
||||
} else {
|
||||
application.containerCompleted(rmContainer, containerStatus, event);
|
||||
node.releaseContainer(container);
|
||||
|
@ -1303,23 +1300,24 @@ public class LeafQueue implements CSQueue {
|
|||
public synchronized void updateClusterResource(Resource clusterResource) {
|
||||
// Update queue properties
|
||||
maxActiveApplications =
|
||||
computeMaxActiveApplications(clusterResource, maxAMResourcePercent,
|
||||
CSQueueUtils.computeMaxActiveApplications(clusterResource, maxAMResourcePercent,
|
||||
absoluteCapacity);
|
||||
maxActiveApplicationsPerUser =
|
||||
computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
|
||||
CSQueueUtils.computeMaxActiveApplicationsPerUser(maxActiveApplications, userLimit,
|
||||
userLimitFactor);
|
||||
|
||||
// Update application properties
|
||||
for (SchedulerApp application : activeApplications) {
|
||||
computeAndSetUserResourceLimit(application, clusterResource);
|
||||
computeAndSetUserResourceLimit(
|
||||
application, clusterResource, Resources.none());
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void updateResource(Resource clusterResource) {
|
||||
float queueLimit = clusterResource.getMemory() * absoluteCapacity;
|
||||
setUtilization(usedResources.getMemory() / queueLimit);
|
||||
setUsedCapacity(
|
||||
usedResources.getMemory() / (clusterResource.getMemory() * capacity));
|
||||
setUsedCapacity(usedResources.getMemory()
|
||||
/ (clusterResource.getMemory() * parent.getAbsoluteCapacity()));
|
||||
|
||||
Resource resourceLimit =
|
||||
Resources.createResource(roundUp((int)queueLimit));
|
||||
|
|
|
@ -118,16 +118,14 @@ public class ParentQueue implements CSQueue {
|
|||
}
|
||||
|
||||
float capacity = (float) rawCapacity / 100;
|
||||
|
||||
float parentAbsoluteCapacity =
|
||||
(parent == null) ? 1.0f : parent.getAbsoluteCapacity();
|
||||
(rootQueue) ? 1.0f : parent.getAbsoluteCapacity();
|
||||
float absoluteCapacity = parentAbsoluteCapacity * capacity;
|
||||
|
||||
float maximumCapacity =
|
||||
(float) cs.getConfiguration().getMaximumCapacity(getQueuePath()) / 100;
|
||||
float absoluteMaxCapacity =
|
||||
(Math.round(maximumCapacity * 100) == CapacitySchedulerConfiguration.UNDEFINED) ?
|
||||
Float.MAX_VALUE : (parentAbsoluteCapacity * maximumCapacity);
|
||||
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
|
||||
|
||||
QueueState state = cs.getConfiguration().getState(getQueuePath());
|
||||
|
||||
|
@ -333,10 +331,15 @@ public class ParentQueue implements CSQueue {
|
|||
}
|
||||
|
||||
public String toString() {
|
||||
return queueName + ":" + capacity + ":" + absoluteCapacity + ":" +
|
||||
getUsedCapacity() + ":" + getUtilization() + ":" +
|
||||
getNumApplications() + ":" + getNumContainers() + ":" +
|
||||
childQueues.size() + " child-queues";
|
||||
return queueName + ": " +
|
||||
"numChildQueue= " + childQueues.size() + ", " +
|
||||
"capacity=" + capacity + ", " +
|
||||
"absoluteCapacity=" + absoluteCapacity + ", " +
|
||||
"usedResources=" + usedResources.getMemory() + "MB, " +
|
||||
"usedCapacity=" + getUsedCapacity() + ", " +
|
||||
"utilization=" + getUtilization() + ", " +
|
||||
"numApps=" + getNumApplications() + ", " +
|
||||
"numContainers=" + getNumContainers();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -492,12 +495,8 @@ public class ParentQueue implements CSQueue {
|
|||
CSQueueUtils.checkMaxCapacity(getQueueName(), capacity, maximumCapacity);
|
||||
|
||||
this.maximumCapacity = maximumCapacity;
|
||||
float parentAbsoluteCapacity =
|
||||
(rootQueue) ? 100.0f : parent.getAbsoluteCapacity();
|
||||
this.absoluteMaxCapacity =
|
||||
(maximumCapacity == CapacitySchedulerConfiguration.UNDEFINED) ?
|
||||
Float.MAX_VALUE :
|
||||
(parentAbsoluteCapacity * maximumCapacity);
|
||||
CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -689,9 +688,11 @@ public class ParentQueue implements CSQueue {
|
|||
|
||||
private synchronized void updateResource(Resource clusterResource) {
|
||||
float queueLimit = clusterResource.getMemory() * absoluteCapacity;
|
||||
float parentAbsoluteCapacity =
|
||||
(rootQueue) ? 1.0f : parent.getAbsoluteCapacity();
|
||||
setUtilization(usedResources.getMemory() / queueLimit);
|
||||
setUsedCapacity(
|
||||
usedResources.getMemory() / (clusterResource.getMemory() * capacity));
|
||||
setUsedCapacity(usedResources.getMemory()
|
||||
/ (clusterResource.getMemory() * parentAbsoluteCapacity));
|
||||
|
||||
Resource resourceLimit =
|
||||
Resources.createResource((int)queueLimit);
|
||||
|
|
|
@ -358,7 +358,7 @@ public class FifoScheduler implements ResourceScheduler {
|
|||
}
|
||||
}
|
||||
|
||||
application.setAvailableResourceLimit(clusterResource);
|
||||
application.setHeadroom(clusterResource);
|
||||
|
||||
LOG.debug("post-assignContainers");
|
||||
application.showRequests();
|
||||
|
|
|
@ -21,16 +21,24 @@ import static org.junit.Assert.*;
|
|||
import static org.mockito.Mockito.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||
import org.apache.hadoop.yarn.api.records.Priority;
|
||||
import org.apache.hadoop.yarn.api.records.QueueACL;
|
||||
import org.apache.hadoop.yarn.api.records.Resource;
|
||||
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
||||
import org.apache.hadoop.yarn.factories.RecordFactory;
|
||||
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApp;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
|
||||
import org.junit.After;
|
||||
|
@ -283,38 +291,76 @@ public class TestApplicationLimits {
|
|||
final String user_0 = "user_0";
|
||||
final String user_1 = "user_1";
|
||||
|
||||
int APPLICATION_ID = 0;
|
||||
RecordFactory recordFactory =
|
||||
RecordFactoryProvider.getRecordFactory(null);
|
||||
RMContext rmContext = TestUtils.getMockRMContext();
|
||||
|
||||
// Submit first application from user_0, check headroom
|
||||
SchedulerApp app_0_0 = getMockApplication(APPLICATION_ID++, user_0);
|
||||
Priority priority_1 = TestUtils.createMockPriority(1);
|
||||
|
||||
// Submit first application with some resource-requests from user_0,
|
||||
// and check headroom
|
||||
final ApplicationAttemptId appAttemptId_0_0 =
|
||||
TestUtils.getMockApplicationAttemptId(0, 0);
|
||||
SchedulerApp app_0_0 =
|
||||
spy(new SchedulerApp(appAttemptId_0_0, user_0, queue, rmContext, null));
|
||||
queue.submitApplication(app_0_0, user_0, A);
|
||||
queue.assignContainers(clusterResource, node_0); // Schedule to compute
|
||||
|
||||
List<ResourceRequest> app_0_0_requests = new ArrayList<ResourceRequest>();
|
||||
app_0_0_requests.add(
|
||||
TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2,
|
||||
priority_1, recordFactory));
|
||||
app_0_0.updateResourceRequests(app_0_0_requests);
|
||||
|
||||
// Schedule to compute
|
||||
queue.assignContainers(clusterResource, node_0);
|
||||
Resource expectedHeadroom = Resources.createResource(10*16*GB);
|
||||
verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_0_0).setHeadroom(eq(expectedHeadroom));
|
||||
|
||||
// Submit second application from user_0, check headroom
|
||||
SchedulerApp app_0_1 = getMockApplication(APPLICATION_ID++, user_0);
|
||||
final ApplicationAttemptId appAttemptId_0_1 =
|
||||
TestUtils.getMockApplicationAttemptId(1, 0);
|
||||
SchedulerApp app_0_1 =
|
||||
spy(new SchedulerApp(appAttemptId_0_1, user_0, queue, rmContext, null));
|
||||
queue.submitApplication(app_0_1, user_0, A);
|
||||
|
||||
List<ResourceRequest> app_0_1_requests = new ArrayList<ResourceRequest>();
|
||||
app_0_1_requests.add(
|
||||
TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2,
|
||||
priority_1, recordFactory));
|
||||
app_0_1.updateResourceRequests(app_0_1_requests);
|
||||
|
||||
// Schedule to compute
|
||||
queue.assignContainers(clusterResource, node_0); // Schedule to compute
|
||||
verify(app_0_0, times(2)).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));// no change
|
||||
verify(app_0_0, times(2)).setHeadroom(eq(expectedHeadroom));
|
||||
verify(app_0_1).setHeadroom(eq(expectedHeadroom));// no change
|
||||
|
||||
// Submit first application from user_1, check for new headroom
|
||||
SchedulerApp app_1_0 = getMockApplication(APPLICATION_ID++, user_1);
|
||||
final ApplicationAttemptId appAttemptId_1_0 =
|
||||
TestUtils.getMockApplicationAttemptId(2, 0);
|
||||
SchedulerApp app_1_0 =
|
||||
spy(new SchedulerApp(appAttemptId_1_0, user_1, queue, rmContext, null));
|
||||
queue.submitApplication(app_1_0, user_1, A);
|
||||
|
||||
List<ResourceRequest> app_1_0_requests = new ArrayList<ResourceRequest>();
|
||||
app_1_0_requests.add(
|
||||
TestUtils.createResourceRequest(RMNodeImpl.ANY, 1*GB, 2,
|
||||
priority_1, recordFactory));
|
||||
app_1_0.updateResourceRequests(app_1_0_requests);
|
||||
|
||||
// Schedule to compute
|
||||
queue.assignContainers(clusterResource, node_0); // Schedule to compute
|
||||
expectedHeadroom = Resources.createResource(10*16*GB / 2); // changes
|
||||
verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_0_0).setHeadroom(eq(expectedHeadroom));
|
||||
verify(app_0_1).setHeadroom(eq(expectedHeadroom));
|
||||
verify(app_1_0).setHeadroom(eq(expectedHeadroom));
|
||||
|
||||
// Now reduce cluster size and check for the smaller headroom
|
||||
clusterResource = Resources.createResource(90*16*GB);
|
||||
queue.assignContainers(clusterResource, node_0); // Schedule to compute
|
||||
expectedHeadroom = Resources.createResource(9*16*GB / 2); // changes
|
||||
verify(app_0_0).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_0_1).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_1_0).setAvailableResourceLimit(eq(expectedHeadroom));
|
||||
verify(app_0_0).setHeadroom(eq(expectedHeadroom));
|
||||
verify(app_0_1).setHeadroom(eq(expectedHeadroom));
|
||||
verify(app_1_0).setHeadroom(eq(expectedHeadroom));
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -255,7 +255,7 @@ public class TestLeafQueue {
|
|||
// Manipulate queue 'a'
|
||||
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
|
||||
//unset maxCapacity
|
||||
a.setMaxCapacity(-0.01f);
|
||||
a.setMaxCapacity(1.0f);
|
||||
|
||||
// Users
|
||||
final String user_0 = "user_0";
|
||||
|
@ -377,7 +377,7 @@ public class TestLeafQueue {
|
|||
// Mock the queue
|
||||
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
|
||||
//unset maxCapacity
|
||||
a.setMaxCapacity(-0.01f);
|
||||
a.setMaxCapacity(1.0f);
|
||||
|
||||
// Users
|
||||
final String user_0 = "user_0";
|
||||
|
@ -491,7 +491,7 @@ public class TestLeafQueue {
|
|||
|
||||
// Revert max-capacity and user-limit-factor
|
||||
// Now, allocations should goto app_3 since it's under user-limit
|
||||
a.setMaxCapacity(-0.01f);
|
||||
a.setMaxCapacity(1.0f);
|
||||
a.setUserLimitFactor(1);
|
||||
a.assignContainers(clusterResource, node_0);
|
||||
assertEquals(7*GB, a.getUsedResources().getMemory());
|
||||
|
@ -548,7 +548,7 @@ public class TestLeafQueue {
|
|||
// Manipulate queue 'a'
|
||||
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
|
||||
//unset maxCapacity
|
||||
a.setMaxCapacity(-0.01f);
|
||||
a.setMaxCapacity(1.0f);
|
||||
|
||||
// Users
|
||||
final String user_0 = "user_0";
|
||||
|
@ -571,7 +571,7 @@ public class TestLeafQueue {
|
|||
String host_0 = "host_0";
|
||||
SchedulerNode node_0 = TestUtils.getMockNode(host_0, DEFAULT_RACK, 0, 4*GB);
|
||||
|
||||
final int numNodes = 1;
|
||||
final int numNodes = 2;
|
||||
Resource clusterResource = Resources.createResource(numNodes * (4*GB));
|
||||
when(csContext.getNumClusterNodes()).thenReturn(numNodes);
|
||||
|
||||
|
@ -646,7 +646,7 @@ public class TestLeafQueue {
|
|||
// Manipulate queue 'a'
|
||||
LeafQueue a = stubLeafQueue((LeafQueue)queues.get(A));
|
||||
//unset maxCapacity
|
||||
a.setMaxCapacity(-0.01f);
|
||||
a.setMaxCapacity(1.0f);
|
||||
a.setUserLimitFactor(10);
|
||||
|
||||
// Users
|
||||
|
@ -673,7 +673,7 @@ public class TestLeafQueue {
|
|||
String host_1 = "host_1";
|
||||
SchedulerNode node_1 = TestUtils.getMockNode(host_1, DEFAULT_RACK, 0, 4*GB);
|
||||
|
||||
final int numNodes = 2;
|
||||
final int numNodes = 3;
|
||||
Resource clusterResource = Resources.createResource(numNodes * (4*GB));
|
||||
when(csContext.getNumClusterNodes()).thenReturn(numNodes);
|
||||
when(csContext.getMaximumResourceCapability()).thenReturn(
|
||||
|
|
|
@ -138,12 +138,34 @@ public class TestParentQueue {
|
|||
when(queue).assignContainers(eq(clusterResource), eq(node));
|
||||
}
|
||||
|
||||
private float computeQueueUsedCapacity(CSQueue queue,
|
||||
int expectedMemory, Resource clusterResource) {
|
||||
return (
|
||||
((float)expectedMemory / clusterResource.getMemory()) *
|
||||
queue.getParent().getAbsoluteCapacity()
|
||||
);
|
||||
}
|
||||
|
||||
private float computeQueueUtilization(CSQueue queue,
|
||||
int expectedMemory, Resource clusterResource) {
|
||||
return (expectedMemory /
|
||||
(clusterResource.getMemory() * queue.getAbsoluteCapacity()));
|
||||
}
|
||||
|
||||
final static float DELTA = 0.0001f;
|
||||
private void verifyQueueMetrics(CSQueue queue,
|
||||
int expectedMemory, Resource clusterResource) {
|
||||
assertEquals(
|
||||
computeQueueUtilization(queue, expectedMemory, clusterResource),
|
||||
queue.getUtilization(),
|
||||
DELTA);
|
||||
assertEquals(
|
||||
computeQueueUsedCapacity(queue, expectedMemory, clusterResource),
|
||||
queue.getUsedCapacity(),
|
||||
DELTA);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleLevelQueues() throws Exception {
|
||||
// Setup queue configs
|
||||
|
@ -173,15 +195,13 @@ public class TestParentQueue {
|
|||
// Start testing
|
||||
LeafQueue a = (LeafQueue)queues.get(A);
|
||||
LeafQueue b = (LeafQueue)queues.get(B);
|
||||
final float delta = 0.0001f;
|
||||
|
||||
// Simulate B returning a container on node_0
|
||||
stubQueueAllocation(a, clusterResource, node_0, 0*GB);
|
||||
stubQueueAllocation(b, clusterResource, node_0, 1*GB);
|
||||
root.assignContainers(clusterResource, node_0);
|
||||
assertEquals(0.0f, a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 1*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 0*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 1*GB, clusterResource);
|
||||
|
||||
// Now, A should get the scheduling opportunity since A=0G/6G, B=1G/14G
|
||||
stubQueueAllocation(a, clusterResource, node_1, 2*GB);
|
||||
|
@ -192,10 +212,8 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(b).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 2*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 2*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 2*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 2*GB, clusterResource);
|
||||
|
||||
// Now, B should get the scheduling opportunity
|
||||
// since A has 2/6G while B has 2/14G
|
||||
|
@ -207,10 +225,8 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(a).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 3*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 4*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 3*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 4*GB, clusterResource);
|
||||
|
||||
// Now, B should still get the scheduling opportunity
|
||||
// since A has 3/6G while B has 4/14G
|
||||
|
@ -222,10 +238,8 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(a).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 3*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 8*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 3*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 8*GB, clusterResource);
|
||||
|
||||
// Now, A should get the scheduling opportunity
|
||||
// since A has 3/6G while B has 8/14G
|
||||
|
@ -237,10 +251,8 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(a).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 4*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 9*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 4*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 9*GB, clusterResource);
|
||||
}
|
||||
|
||||
private static final String C = "c";
|
||||
|
@ -323,22 +335,16 @@ public class TestParentQueue {
|
|||
CSQueue b2 = queues.get(B2);
|
||||
CSQueue b3 = queues.get(B3);
|
||||
|
||||
final float delta = 0.0001f;
|
||||
|
||||
// Simulate C returning a container on node_0
|
||||
stubQueueAllocation(a, clusterResource, node_0, 0*GB);
|
||||
stubQueueAllocation(b, clusterResource, node_0, 0*GB);
|
||||
stubQueueAllocation(c, clusterResource, node_0, 1*GB);
|
||||
stubQueueAllocation(d, clusterResource, node_0, 0*GB);
|
||||
root.assignContainers(clusterResource, node_0);
|
||||
assertEquals(computeQueueUtilization(a, 0*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 0*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(c, 1*GB, clusterResource),
|
||||
c.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(d, 0*GB, clusterResource),
|
||||
d.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 0*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 0*GB, clusterResource);
|
||||
verifyQueueMetrics(c, 1*GB, clusterResource);
|
||||
verifyQueueMetrics(d, 0*GB, clusterResource);
|
||||
reset(a); reset(b); reset(c);
|
||||
|
||||
// Now get B2 to allocate
|
||||
|
@ -347,12 +353,9 @@ public class TestParentQueue {
|
|||
stubQueueAllocation(b2, clusterResource, node_1, 4*GB);
|
||||
stubQueueAllocation(c, clusterResource, node_1, 0*GB);
|
||||
root.assignContainers(clusterResource, node_1);
|
||||
assertEquals(computeQueueUtilization(a, 0*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 4*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(c, 1*GB, clusterResource),
|
||||
c.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 0*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 4*GB, clusterResource);
|
||||
verifyQueueMetrics(c, 1*GB, clusterResource);
|
||||
reset(a); reset(b); reset(c);
|
||||
|
||||
// Now get both A1, C & B3 to allocate in right order
|
||||
|
@ -368,12 +371,9 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(b).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 1*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 6*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(c, 3*GB, clusterResource),
|
||||
c.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 1*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 6*GB, clusterResource);
|
||||
verifyQueueMetrics(c, 3*GB, clusterResource);
|
||||
reset(a); reset(b); reset(c);
|
||||
|
||||
// Now verify max-capacity
|
||||
|
@ -399,14 +399,10 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(c).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 3*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 8*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(c, 4*GB, clusterResource),
|
||||
c.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 3*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 8*GB, clusterResource);
|
||||
verifyQueueMetrics(c, 4*GB, clusterResource);
|
||||
reset(a); reset(b); reset(c);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -438,15 +434,13 @@ public class TestParentQueue {
|
|||
// Start testing
|
||||
LeafQueue a = (LeafQueue)queues.get(A);
|
||||
LeafQueue b = (LeafQueue)queues.get(B);
|
||||
final float delta = 0.0001f;
|
||||
|
||||
// Simulate B returning a container on node_0
|
||||
stubQueueAllocation(a, clusterResource, node_0, 0*GB, NodeType.OFF_SWITCH);
|
||||
stubQueueAllocation(b, clusterResource, node_0, 1*GB, NodeType.OFF_SWITCH);
|
||||
root.assignContainers(clusterResource, node_0);
|
||||
assertEquals(0.0f, a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 1*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 0*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 1*GB, clusterResource);
|
||||
|
||||
// Now, A should get the scheduling opportunity since A=0G/6G, B=1G/14G
|
||||
// also, B gets a scheduling opportunity since A allocates RACK_LOCAL
|
||||
|
@ -458,10 +452,8 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(b).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 2*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 2*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 2*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 2*GB, clusterResource);
|
||||
|
||||
// Now, B should get the scheduling opportunity
|
||||
// since A has 2/6G while B has 2/14G,
|
||||
|
@ -474,10 +466,8 @@ public class TestParentQueue {
|
|||
any(SchedulerNode.class));
|
||||
allocationOrder.verify(a).assignContainers(eq(clusterResource),
|
||||
any(SchedulerNode.class));
|
||||
assertEquals(computeQueueUtilization(a, 2*GB, clusterResource),
|
||||
a.getUtilization(), delta);
|
||||
assertEquals(computeQueueUtilization(b, 4*GB, clusterResource),
|
||||
b.getUtilization(), delta);
|
||||
verifyQueueMetrics(a, 2*GB, clusterResource);
|
||||
verifyQueueMetrics(b, 4*GB, clusterResource);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -30,6 +30,8 @@ public class TestQueueParsing {
|
|||
|
||||
private static final Log LOG = LogFactory.getLog(TestQueueParsing.class);
|
||||
|
||||
private static final double DELTA = 0.000001;
|
||||
|
||||
@Test
|
||||
public void testQueueParsing() throws Exception {
|
||||
CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
|
||||
|
@ -37,6 +39,20 @@ public class TestQueueParsing {
|
|||
|
||||
CapacityScheduler capacityScheduler = new CapacityScheduler();
|
||||
capacityScheduler.reinitialize(conf, null, null);
|
||||
|
||||
CSQueue a = capacityScheduler.getQueue("a");
|
||||
Assert.assertEquals(0.10, a.getAbsoluteCapacity(), DELTA);
|
||||
Assert.assertEquals(0.15, a.getAbsoluteMaximumCapacity(), DELTA);
|
||||
|
||||
CSQueue b1 = capacityScheduler.getQueue("b1");
|
||||
Assert.assertEquals(0.2 * 0.5, b1.getAbsoluteCapacity(), DELTA);
|
||||
Assert.assertEquals("Parent B has no MAX_CAP",
|
||||
0.85, b1.getAbsoluteMaximumCapacity(), DELTA);
|
||||
|
||||
CSQueue c12 = capacityScheduler.getQueue("c12");
|
||||
Assert.assertEquals(0.7 * 0.5 * 0.45, c12.getAbsoluteCapacity(), DELTA);
|
||||
Assert.assertEquals(0.7 * 0.55 * 0.7,
|
||||
c12.getAbsoluteMaximumCapacity(), DELTA);
|
||||
}
|
||||
|
||||
private void setupQueueConfiguration(CapacitySchedulerConfiguration conf) {
|
||||
|
@ -47,12 +63,14 @@ public class TestQueueParsing {
|
|||
|
||||
final String A = CapacitySchedulerConfiguration.ROOT + ".a";
|
||||
conf.setCapacity(A, 10);
|
||||
conf.setMaximumCapacity(A, 15);
|
||||
|
||||
final String B = CapacitySchedulerConfiguration.ROOT + ".b";
|
||||
conf.setCapacity(B, 20);
|
||||
|
||||
final String C = CapacitySchedulerConfiguration.ROOT + ".c";
|
||||
conf.setCapacity(C, 70);
|
||||
conf.setMaximumCapacity(C, 70);
|
||||
|
||||
LOG.info("Setup top-level queues");
|
||||
|
||||
|
@ -61,15 +79,20 @@ public class TestQueueParsing {
|
|||
final String A2 = A + ".a2";
|
||||
conf.setQueues(A, new String[] {"a1", "a2"});
|
||||
conf.setCapacity(A1, 30);
|
||||
conf.setMaximumCapacity(A1, 45);
|
||||
conf.setCapacity(A2, 70);
|
||||
conf.setMaximumCapacity(A2, 85);
|
||||
|
||||
final String B1 = B + ".b1";
|
||||
final String B2 = B + ".b2";
|
||||
final String B3 = B + ".b3";
|
||||
conf.setQueues(B, new String[] {"b1", "b2", "b3"});
|
||||
conf.setCapacity(B1, 50);
|
||||
conf.setMaximumCapacity(B1, 85);
|
||||
conf.setCapacity(B2, 30);
|
||||
conf.setMaximumCapacity(B2, 35);
|
||||
conf.setCapacity(B3, 20);
|
||||
conf.setMaximumCapacity(B3, 35);
|
||||
|
||||
final String C1 = C + ".c1";
|
||||
final String C2 = C + ".c2";
|
||||
|
@ -77,9 +100,13 @@ public class TestQueueParsing {
|
|||
final String C4 = C + ".c4";
|
||||
conf.setQueues(C, new String[] {"c1", "c2", "c3", "c4"});
|
||||
conf.setCapacity(C1, 50);
|
||||
conf.setMaximumCapacity(C1, 55);
|
||||
conf.setCapacity(C2, 10);
|
||||
conf.setMaximumCapacity(C2, 25);
|
||||
conf.setCapacity(C3, 35);
|
||||
conf.setMaximumCapacity(C3, 38);
|
||||
conf.setCapacity(C4, 5);
|
||||
conf.setMaximumCapacity(C4, 5);
|
||||
|
||||
LOG.info("Setup 2nd-level queues");
|
||||
|
||||
|
@ -89,8 +116,11 @@ public class TestQueueParsing {
|
|||
final String C13 = C1 + ".c13";
|
||||
conf.setQueues(C1, new String[] {"c11", "c12", "c13"});
|
||||
conf.setCapacity(C11, 15);
|
||||
conf.setMaximumCapacity(C11, 30);
|
||||
conf.setCapacity(C12, 45);
|
||||
conf.setMaximumCapacity(C12, 70);
|
||||
conf.setCapacity(C13, 40);
|
||||
conf.setMaximumCapacity(C13, 40);
|
||||
|
||||
LOG.info("Setup 3rd-level queues");
|
||||
}
|
||||
|
|
|
@ -235,12 +235,13 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
Element qElem = (Element) queues.item(j);
|
||||
String qName = WebServicesTestUtils.getXmlString(qElem, "queueName");
|
||||
String q = CapacitySchedulerConfiguration.ROOT + "." + qName;
|
||||
verifySubQueueXML(qElem, q, 100);
|
||||
verifySubQueueXML(qElem, q, 100, 100);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void verifySubQueueXML(Element qElem, String q, float parentAbsCapacity)
|
||||
public void verifySubQueueXML(Element qElem, String q,
|
||||
float parentAbsCapacity, float parentAbsMaxCapacity)
|
||||
throws Exception {
|
||||
NodeList queues = qElem.getElementsByTagName("subQueues");
|
||||
QueueInfo qi = (queues != null) ? new QueueInfo() : new LeafQueueInfo();
|
||||
|
@ -258,14 +259,15 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
WebServicesTestUtils.getXmlString(qElem, "usedResources");
|
||||
qi.queueName = WebServicesTestUtils.getXmlString(qElem, "queueName");
|
||||
qi.state = WebServicesTestUtils.getXmlString(qElem, "state");
|
||||
verifySubQueueGeneric(q, qi, parentAbsCapacity);
|
||||
verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
|
||||
|
||||
if (queues != null) {
|
||||
for (int j = 0; j < queues.getLength(); j++) {
|
||||
Element subqElem = (Element) queues.item(j);
|
||||
String qName = WebServicesTestUtils.getXmlString(subqElem, "queueName");
|
||||
String q2 = q + "." + qName;
|
||||
verifySubQueueXML(subqElem, q2, qi.absoluteCapacity);
|
||||
verifySubQueueXML(subqElem, q2,
|
||||
qi.absoluteCapacity, qi.absoluteMaxCapacity);
|
||||
}
|
||||
} else {
|
||||
LeafQueueInfo lqi = (LeafQueueInfo) qi;
|
||||
|
@ -309,7 +311,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
for (int i = 0; i < arr.length(); i++) {
|
||||
JSONObject obj = arr.getJSONObject(i);
|
||||
String q = CapacitySchedulerConfiguration.ROOT + "." + obj.getString("queueName");
|
||||
verifySubQueue(obj, q, 100);
|
||||
verifySubQueue(obj, q, 100, 100);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -323,7 +325,8 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
assertTrue("queueName doesn't match", "root".matches(queueName));
|
||||
}
|
||||
|
||||
private void verifySubQueue(JSONObject info, String q, float parentAbsCapacity)
|
||||
private void verifySubQueue(JSONObject info, String q,
|
||||
float parentAbsCapacity, float parentAbsMaxCapacity)
|
||||
throws JSONException, Exception {
|
||||
int numExpectedElements = 11;
|
||||
boolean isParentQueue = true;
|
||||
|
@ -345,7 +348,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
qi.queueName = info.getString("queueName");
|
||||
qi.state = info.getString("state");
|
||||
|
||||
verifySubQueueGeneric(q, qi, parentAbsCapacity);
|
||||
verifySubQueueGeneric(q, qi, parentAbsCapacity, parentAbsMaxCapacity);
|
||||
|
||||
if (isParentQueue) {
|
||||
JSONArray arr = info.getJSONArray("subQueues");
|
||||
|
@ -353,7 +356,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
for (int i = 0; i < arr.length(); i++) {
|
||||
JSONObject obj = arr.getJSONObject(i);
|
||||
String q2 = q + "." + obj.getString("queueName");
|
||||
verifySubQueue(obj, q2, qi.absoluteCapacity);
|
||||
verifySubQueue(obj, q2, qi.absoluteCapacity, qi.absoluteMaxCapacity);
|
||||
}
|
||||
} else {
|
||||
LeafQueueInfo lqi = (LeafQueueInfo) qi;
|
||||
|
@ -371,7 +374,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
}
|
||||
|
||||
private void verifySubQueueGeneric(String q, QueueInfo info,
|
||||
float parentAbsCapacity) throws Exception {
|
||||
float parentAbsCapacity, float parentAbsMaxCapacity) throws Exception {
|
||||
String[] qArr = q.split("\\.");
|
||||
assertTrue("q name invalid: " + q, qArr.length > 1);
|
||||
String qshortName = qArr[qArr.length - 1];
|
||||
|
@ -380,7 +383,7 @@ public class TestRMWebServicesCapacitySched extends JerseyTest {
|
|||
assertEquals("capacity doesn't match", csConf.getCapacity(q),
|
||||
info.capacity, 1e-3f);
|
||||
float expectCapacity = csConf.getMaximumCapacity(q);
|
||||
float expectAbsMaxCapacity = parentAbsCapacity * (info.maxCapacity/100);
|
||||
float expectAbsMaxCapacity = parentAbsMaxCapacity * (info.maxCapacity/100);
|
||||
if (CapacitySchedulerConfiguration.UNDEFINED == expectCapacity) {
|
||||
expectCapacity = 100;
|
||||
expectAbsMaxCapacity = 100;
|
||||
|
|
|
@ -57,7 +57,7 @@ public class AmIpFilter implements Filter {
|
|||
proxyUriBase = conf.getInitParameter(PROXY_URI_BASE);
|
||||
}
|
||||
|
||||
private Set<String> getProxyAddresses() throws ServletException {
|
||||
protected Set<String> getProxyAddresses() throws ServletException {
|
||||
long now = System.currentTimeMillis();
|
||||
synchronized(this) {
|
||||
if(proxyAddresses == null || (lastUpdate + updateInterval) >= now) {
|
||||
|
@ -97,12 +97,15 @@ public class AmIpFilter implements Filter {
|
|||
}
|
||||
|
||||
String user = null;
|
||||
|
||||
if (httpReq.getCookies() != null) {
|
||||
for(Cookie c: httpReq.getCookies()) {
|
||||
if(WebAppProxyServlet.PROXY_USER_COOKIE_NAME.equals(c.getName())){
|
||||
user = c.getValue();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(user == null) {
|
||||
LOG.warn("Could not find "+WebAppProxyServlet.PROXY_USER_COOKIE_NAME
|
||||
+" cookie, so user will not be set");
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.yarn.server.webproxy.amfilter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import javax.servlet.Filter;
|
||||
import javax.servlet.FilterChain;
|
||||
import javax.servlet.FilterConfig;
|
||||
import javax.servlet.ServletContext;
|
||||
import javax.servlet.ServletException;
|
||||
import javax.servlet.ServletRequest;
|
||||
import javax.servlet.ServletResponse;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
|
||||
public class TestAmFilter {
|
||||
|
||||
private String proxyHost = "bogushost.com";
|
||||
private String proxyUri = "http://bogus";
|
||||
|
||||
private class TestAmIpFilter extends AmIpFilter {
|
||||
|
||||
private Set<String> proxyAddresses = null;
|
||||
|
||||
protected Set<String> getProxyAddresses() {
|
||||
if(proxyAddresses == null) {
|
||||
proxyAddresses = new HashSet<String>();
|
||||
}
|
||||
proxyAddresses.add(proxyHost);
|
||||
return proxyAddresses;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static class DummyFilterConfig implements FilterConfig {
|
||||
final Map<String, String> map;
|
||||
|
||||
|
||||
DummyFilterConfig(Map<String,String> map) {
|
||||
this.map = map;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getFilterName() {
|
||||
return "dummy";
|
||||
}
|
||||
@Override
|
||||
public String getInitParameter(String arg0) {
|
||||
return map.get(arg0);
|
||||
}
|
||||
@Override
|
||||
public Enumeration<String> getInitParameterNames() {
|
||||
return Collections.enumeration(map.keySet());
|
||||
}
|
||||
@Override
|
||||
public ServletContext getServletContext() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void filterNullCookies() throws Exception {
|
||||
HttpServletRequest request = Mockito.mock(HttpServletRequest.class);
|
||||
|
||||
Mockito.when(request.getCookies()).thenReturn(null);
|
||||
Mockito.when(request.getRemoteAddr()).thenReturn(proxyHost);
|
||||
|
||||
HttpServletResponse response = Mockito.mock(HttpServletResponse.class);
|
||||
|
||||
final AtomicBoolean invoked = new AtomicBoolean();
|
||||
|
||||
FilterChain chain = new FilterChain() {
|
||||
@Override
|
||||
public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse)
|
||||
throws IOException, ServletException {
|
||||
invoked.set(true);
|
||||
}
|
||||
};
|
||||
|
||||
Map<String, String> params = new HashMap<String, String>();
|
||||
params.put(AmIpFilter.PROXY_HOST, proxyHost);
|
||||
params.put(AmIpFilter.PROXY_URI_BASE, proxyUri);
|
||||
FilterConfig conf = new DummyFilterConfig(params);
|
||||
Filter filter = new TestAmIpFilter();
|
||||
filter.init(conf);
|
||||
filter.doFilter(request, response, chain);
|
||||
Assert.assertTrue(invoked.get());
|
||||
filter.destroy();
|
||||
}
|
||||
}
|
|
@ -95,7 +95,7 @@ Hadoop MapReduce Next Generation - Cluster Setup
|
|||
*--------------------------------------+--------------------------------------+
|
||||
| DataNode | HADOOP_DATANODE_OPTS |
|
||||
*--------------------------------------+--------------------------------------+
|
||||
| Backup NameNode | HADOOP_SECONDARYNAMENODE_OPTS |
|
||||
| Secondary NameNode | HADOOP_SECONDARYNAMENODE_OPTS |
|
||||
*--------------------------------------+--------------------------------------+
|
||||
| ResourceManager | YARN_RESOURCEMANAGER_OPTS |
|
||||
*--------------------------------------+--------------------------------------+
|
||||
|
@ -537,15 +537,15 @@ Hadoop MapReduce Next Generation - Cluster Setup
|
|||
|
||||
It's recommended to have them share a Unix group, e.g. <<<hadoop>>>.
|
||||
|
||||
*--------------------------------------+--------------------------------------+
|
||||
*--------------------------------------+----------------------------------------------------------------------+
|
||||
|| User:Group || Daemons |
|
||||
*--------------------------------------+--------------------------------------+
|
||||
| hdfs:hadoop | NameNode, Backup NameNode, DataNode |
|
||||
*--------------------------------------+--------------------------------------+
|
||||
*--------------------------------------+----------------------------------------------------------------------+
|
||||
| hdfs:hadoop | NameNode, Secondary NameNode, Checkpoint Node, Backup Node, DataNode |
|
||||
*--------------------------------------+----------------------------------------------------------------------+
|
||||
| yarn:hadoop | ResourceManager, NodeManager |
|
||||
*--------------------------------------+--------------------------------------+
|
||||
*--------------------------------------+----------------------------------------------------------------------+
|
||||
| mapred:hadoop | MapReduce JobHistory Server |
|
||||
*--------------------------------------+--------------------------------------+
|
||||
*--------------------------------------+----------------------------------------------------------------------+
|
||||
|
||||
* <<<Permissions for both HDFS and local fileSystem paths>>>
|
||||
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
~~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||
~~ you may not use this file except in compliance with the License.
|
||||
~~ You may obtain a copy of the License at
|
||||
~~
|
||||
~~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~~
|
||||
~~ Unless required by applicable law or agreed to in writing, software
|
||||
~~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
~~ See the License for the specific language governing permissions and
|
||||
~~ limitations under the License. See accompanying LICENSE file.
|
||||
|
||||
---
|
||||
YARN
|
||||
---
|
||||
---
|
||||
${maven.build.timestamp}
|
||||
|
||||
Web Application Proxy
|
||||
|
||||
The Web Application Proxy is part of YARN. By default it will run as part of
|
||||
the Resource Manager (RM), but can be configured to run in standalone mode.
|
||||
The reason for the proxy is to reduce the possibility of web based attacks
|
||||
through YARN.
|
||||
|
||||
In YARN the Application Master(AM) has the responsibility to provide a web UI
|
||||
and to send that link to the RM. This opens up a number of potential
|
||||
issues. The RM runs as a trusted user, and people visiting that web
|
||||
address will treat it, and the links it provides to them, as trusted, when in
|
||||
reality the AM is running as a non-trusted user, and the links it gives to
|
||||
the RM could point to anything malicious or otherwise. The Web Application
|
||||
Proxy mitigates this risk by warning users that do not own the given
|
||||
application that they are connecting to an untrusted site.
|
||||
|
||||
In addition to this the proxy also tries to reduce the impact that a malicious
|
||||
AM could have on a user. It primarily does this by stripping out cookies from
|
||||
the user, and replacing them with a single cookie providing the user name of
|
||||
the logged in user. This is because most web based authentication systems will
|
||||
identify a user based on a cookie. By providing this cookie to an
|
||||
untrusted application it opens up the potential for an exploit. If the cookie
|
||||
is designed properly that potential should be fairly minimal, but this is just
|
||||
to reduce that potential attack vector. The current proxy implementation does
|
||||
nothing to prevent the AM from providing links to malicious external sites,
|
||||
nor does it do anything to prevent malicious javascript code from running as
|
||||
well. In fact, JavaScript can be used to get the cookies, so stripping the
|
||||
cookies from the request has minimal benefit at this time.
|
||||
|
||||
In the future we hope to address the attack vectors described above and make
|
||||
attaching to an AM's web UI safer.
|
|
@ -47,4 +47,6 @@ MapReduce NextGen aka YARN aka MRv2
|
|||
|
||||
* {{{./CapacityScheduler.html}Capacity Scheduler}}
|
||||
|
||||
* {{{./WebApplicationProxy.html}Web Application Proxy}}
|
||||
|
||||
|
||||
|
|
|
@ -223,6 +223,11 @@
|
|||
<artifactId>hadoop-archives</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-distcp</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-rumen</artifactId>
|
||||
|
@ -709,11 +714,21 @@
|
|||
<artifactId>maven-project-info-reports-plugin</artifactId>
|
||||
<version>2.4</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<version>2.2</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>exec-maven-plugin</artifactId>
|
||||
<version>1.2</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pdf-plugin</artifactId>
|
||||
<version>1.1</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
|
||||
|
@ -811,6 +826,14 @@
|
|||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pdf-plugin</artifactId>
|
||||
<configuration>
|
||||
<outputDirectory>${project.reporting.outputDirectory}</outputDirectory>
|
||||
<includeReports>false</includeReports>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
|
|
@ -61,6 +61,7 @@
|
|||
<item name="YARN Architecture" href="hadoop-yarn/hadoop-yarn-site/YARN.html"/>
|
||||
<item name="Writing Yarn Applications" href="hadoop-yarn/hadoop-yarn-site/WritingYarnApplications.html"/>
|
||||
<item name="Capacity Scheduler" href="hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html"/>
|
||||
<item name="Web Application Proxy" href="hadoop-yarn/hadoop-yarn-site/WebApplicationProxy.html"/>
|
||||
</menu>
|
||||
|
||||
<menu name="YARN REST API's" inherit="top">
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
DistCp (distributed copy) is a tool used for large inter/intra-cluster copying.
|
||||
It uses Map/Reduce to effect its distribution, error handling and recovery,
|
||||
and reporting. It expands a list of files and directories into input to map tasks,
|
||||
each of which will copy a partition of the files specified in the source list.
|
||||
|
||||
Version 0.1 (2010/08/02 sriksun)
|
||||
- Initial Version
|
|
@ -0,0 +1,198 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. See accompanying LICENSE file.
|
||||
-->
|
||||
<project>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-project</artifactId>
|
||||
<version>0.23.1-SNAPSHOT</version>
|
||||
<relativePath>../../hadoop-project</relativePath>
|
||||
</parent>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-distcp</artifactId>
|
||||
<version>0.23.1-SNAPSHOT</version>
|
||||
<description>Apache Hadoop Distributed Copy</description>
|
||||
<name>Apache Hadoop Distributed Copy</name>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<properties>
|
||||
<file.encoding>UTF-8</file.encoding>
|
||||
<downloadSources>true</downloadSources>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-annotations</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-app</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-hs</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-core</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-hdfs</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-hdfs</artifactId>
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
<filtering>true</filtering>
|
||||
</resource>
|
||||
</resources>
|
||||
<testResources>
|
||||
<testResource>
|
||||
<directory>src/test/resources</directory>
|
||||
<filtering>true</filtering>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<forkMode>always</forkMode>
|
||||
<forkedProcessTimeoutInSeconds>600</forkedProcessTimeoutInSeconds>
|
||||
<argLine>-Xmx1024m</argLine>
|
||||
<includes>
|
||||
<include>**/Test*.java</include>
|
||||
</includes>
|
||||
<redirectTestOutputToFile>true</redirectTestOutputToFile>
|
||||
<systemProperties>
|
||||
<property>
|
||||
<name>test.build.data</name>
|
||||
<value>${basedir}/target/test/data</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hadoop.log.dir</name>
|
||||
<value>target/test/logs</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>org.apache.commons.logging.Log</name>
|
||||
<value>org.apache.commons.logging.impl.SimpleLog</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>org.apache.commons.logging.simplelog.defaultlog</name>
|
||||
<value>warn</value>
|
||||
</property>
|
||||
</systemProperties>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/lib</outputDirectory>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-checkstyle-plugin</artifactId>
|
||||
<configuration>
|
||||
<enableRulesSummary>true</enableRulesSummary>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<configuration>
|
||||
<archive>
|
||||
<manifest>
|
||||
<mainClass>org.apache.hadoop.tools.DistCp</mainClass>
|
||||
</manifest>
|
||||
</archive>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<configuration>
|
||||
<attach>true</attach>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-pdf-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>pdf</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>pdf</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -0,0 +1,218 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.security.Credentials;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* The CopyListing abstraction is responsible for how the list of
|
||||
* sources and targets is constructed, for DistCp's copy function.
|
||||
* The copy-listing should be a SequenceFile<Text, FileStatus>,
|
||||
* located at the path specified to buildListing(),
|
||||
* each entry being a pair of (Source relative path, source file status),
|
||||
* all the paths being fully qualified.
|
||||
*/
|
||||
public abstract class CopyListing extends Configured {
|
||||
|
||||
private Credentials credentials;
|
||||
|
||||
/**
|
||||
* Build listing function creates the input listing that distcp uses to
|
||||
* perform the copy.
|
||||
*
|
||||
* The build listing is a sequence file that has relative path of a file in the key
|
||||
* and the file status information of the source file in the value
|
||||
*
|
||||
* For instance if the source path is /tmp/data and the traversed path is
|
||||
* /tmp/data/dir1/dir2/file1, then the sequence file would contain
|
||||
*
|
||||
* key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
|
||||
*
|
||||
* File would also contain directory entries. Meaning, if /tmp/data/dir1/dir2/file1
|
||||
* is the only file under /tmp/data, the resulting sequence file would contain the
|
||||
* following entries
|
||||
*
|
||||
* key: /dir1 and value: FileStatus(/tmp/data/dir1)
|
||||
* key: /dir1/dir2 and value: FileStatus(/tmp/data/dir1/dir2)
|
||||
* key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
|
||||
*
|
||||
* Cases requiring special handling:
|
||||
* If source path is a file (/tmp/file1), contents of the file will be as follows
|
||||
*
|
||||
* TARGET DOES NOT EXIST: Key-"", Value-FileStatus(/tmp/file1)
|
||||
* TARGET IS FILE : Key-"", Value-FileStatus(/tmp/file1)
|
||||
* TARGET IS DIR : Key-"/file1", Value-FileStatus(/tmp/file1)
|
||||
*
|
||||
* @param pathToListFile - Output file where the listing would be stored
|
||||
* @param options - Input options to distcp
|
||||
* @throws IOException - Exception if any
|
||||
*/
|
||||
public final void buildListing(Path pathToListFile,
|
||||
DistCpOptions options) throws IOException {
|
||||
validatePaths(options);
|
||||
doBuildListing(pathToListFile, options);
|
||||
Configuration config = getConf();
|
||||
|
||||
config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, pathToListFile.toString());
|
||||
config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, getBytesToCopy());
|
||||
config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths());
|
||||
|
||||
checkForDuplicates(pathToListFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate input and output paths
|
||||
*
|
||||
* @param options - Input options
|
||||
* @throws InvalidInputException if inputs are invalid
|
||||
* @throws IOException on any exception with the FS
|
||||
*/
|
||||
protected abstract void validatePaths(DistCpOptions options)
|
||||
throws IOException, InvalidInputException;
|
||||
|
||||
/**
|
||||
* The interface to be implemented by sub-classes, to create the source/target file listing.
|
||||
* @param pathToListFile Path on HDFS where the listing file is written.
|
||||
* @param options Input Options for DistCp (indicating source/target paths.)
|
||||
* @throws IOException thrown on failure to create the listing file.
|
||||
*/
|
||||
protected abstract void doBuildListing(Path pathToListFile,
|
||||
DistCpOptions options) throws IOException;
|
||||
|
||||
/**
|
||||
* Return the total bytes that distCp should copy for the source paths
|
||||
* This doesn't consider whether file is same should be skipped during copy
|
||||
*
|
||||
* @return total bytes to copy
|
||||
*/
|
||||
protected abstract long getBytesToCopy();
|
||||
|
||||
/**
|
||||
* Return the total number of paths to distcp, includes directories as well
|
||||
* This doesn't consider whether file/dir is already present and should be skipped during copy
|
||||
*
|
||||
* @return Total number of paths to distcp
|
||||
*/
|
||||
protected abstract long getNumberOfPaths();
|
||||
|
||||
/**
|
||||
* Validate the final resulting path listing to see if there are any duplicate entries
|
||||
*
|
||||
* @param pathToListFile - path listing build by doBuildListing
|
||||
* @throws IOException - Any issues while checking for duplicates and throws
|
||||
* @throws DuplicateFileException - if there are duplicates
|
||||
*/
|
||||
private void checkForDuplicates(Path pathToListFile)
|
||||
throws DuplicateFileException, IOException {
|
||||
|
||||
Configuration config = getConf();
|
||||
FileSystem fs = pathToListFile.getFileSystem(config);
|
||||
|
||||
Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile);
|
||||
|
||||
SequenceFile.Reader reader = new SequenceFile.Reader(
|
||||
config, SequenceFile.Reader.file(sortedList));
|
||||
try {
|
||||
Text lastKey = new Text("*"); //source relative path can never hold *
|
||||
FileStatus lastFileStatus = new FileStatus();
|
||||
|
||||
Text currentKey = new Text();
|
||||
while (reader.next(currentKey)) {
|
||||
if (currentKey.equals(lastKey)) {
|
||||
FileStatus currentFileStatus = new FileStatus();
|
||||
reader.getCurrentValue(currentFileStatus);
|
||||
throw new DuplicateFileException("File " + lastFileStatus.getPath() + " and " +
|
||||
currentFileStatus.getPath() + " would cause duplicates. Aborting");
|
||||
}
|
||||
reader.getCurrentValue(lastFileStatus);
|
||||
lastKey.set(currentKey);
|
||||
}
|
||||
} finally {
|
||||
IOUtils.closeStream(reader);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Protected constructor; stores the configuration and the credentials store.
 *
 * @param configuration the input configuration, with which the source/target
 *                      FileSystems may be accessed
 * @param credentials credentials object on which the FS delegation tokens are
 *                    cached; if null, delegation token caching is skipped
 */
protected CopyListing(final Configuration configuration,
                      final Credentials credentials) {
  setConf(configuration);
  setCredentials(credentials);
}
|
||||
|
||||
/**
|
||||
* set Credentials store, on which FS delegatin token will be cached
|
||||
* @param credentials - Credentials object
|
||||
*/
|
||||
protected void setCredentials(Credentials credentials) {
|
||||
this.credentials = credentials;
|
||||
}
|
||||
|
||||
/**
|
||||
* get credentials to update the delegation tokens for accessed FS objects
|
||||
* @return Credentials object
|
||||
*/
|
||||
protected Credentials getCredentials() {
|
||||
return credentials;
|
||||
}
|
||||
|
||||
/**
|
||||
* Public Factory method with which the appropriate CopyListing implementation may be retrieved.
|
||||
* @param configuration The input configuration.
|
||||
* @param credentials Credentials object on which the FS delegation tokens are cached
|
||||
* @param options The input Options, to help choose the appropriate CopyListing Implementation.
|
||||
* @return An instance of the appropriate CopyListing implementation.
|
||||
*/
|
||||
public static CopyListing getCopyListing(Configuration configuration,
|
||||
Credentials credentials,
|
||||
DistCpOptions options) {
|
||||
if (options.getSourceFileListing() == null) {
|
||||
return new GlobbedCopyListing(configuration, credentials);
|
||||
} else {
|
||||
return new FileBasedCopyListing(configuration, credentials);
|
||||
}
|
||||
}
|
||||
|
||||
static class DuplicateFileException extends RuntimeException {
|
||||
public DuplicateFileException(String message) {
|
||||
super(message);
|
||||
}
|
||||
}
|
||||
|
||||
static class InvalidInputException extends RuntimeException {
|
||||
public InvalidInputException(String message) {
|
||||
super(message);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,405 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.conf.Configured;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.JobContext;
|
||||
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
|
||||
import org.apache.hadoop.mapreduce.Cluster;
|
||||
import org.apache.hadoop.tools.CopyListing.*;
|
||||
import org.apache.hadoop.tools.mapred.CopyMapper;
|
||||
import org.apache.hadoop.tools.mapred.CopyOutputFormat;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* DistCp is the main driver-class for DistCpV2.
|
||||
* For command-line use, DistCp::main() orchestrates the parsing of command-line
|
||||
* parameters and the launch of the DistCp job.
|
||||
* For programmatic use, a DistCp object can be constructed by specifying
|
||||
* options (in a DistCpOptions object), and DistCp::execute() may be used to
|
||||
* launch the copy-job. DistCp may alternatively be sub-classed to fine-tune
|
||||
* behaviour.
|
||||
*/
|
||||
public class DistCp extends Configured implements Tool {
|
||||
private static final Log LOG = LogFactory.getLog(DistCp.class);
|
||||
|
||||
private DistCpOptions inputOptions;
|
||||
private Path metaFolder;
|
||||
|
||||
private static final String PREFIX = "_distcp";
|
||||
private static final String WIP_PREFIX = "._WIP_";
|
||||
private static final String DISTCP_DEFAULT_XML = "distcp-default.xml";
|
||||
public static final Random rand = new Random();
|
||||
|
||||
private boolean submitted;
|
||||
private FileSystem jobFS;
|
||||
|
||||
/**
|
||||
* Public Constructor. Creates DistCp object with specified input-parameters.
|
||||
* (E.g. source-paths, target-location, etc.)
|
||||
* @param inputOptions Options (indicating source-paths, target-location.)
|
||||
* @param configuration The Hadoop configuration against which the Copy-mapper must run.
|
||||
* @throws Exception, on failure.
|
||||
*/
|
||||
public DistCp(Configuration configuration, DistCpOptions inputOptions) throws Exception {
|
||||
Configuration config = new Configuration(configuration);
|
||||
config.addResource(DISTCP_DEFAULT_XML);
|
||||
setConf(config);
|
||||
this.inputOptions = inputOptions;
|
||||
this.metaFolder = createMetaFolderPath();
|
||||
}
|
||||
|
||||
/**
|
||||
* To be used with the ToolRunner. Not for public consumption.
|
||||
*/
|
||||
private DistCp() {}
|
||||
|
||||
/**
|
||||
* Implementation of Tool::run(). Orchestrates the copy of source file(s)
|
||||
* to target location, by:
|
||||
* 1. Creating a list of files to be copied to target.
|
||||
* 2. Launching a Map-only job to copy the files. (Delegates to execute().)
|
||||
* @param argv List of arguments passed to DistCp, from the ToolRunner.
|
||||
* @return On success, it returns 0. Else, -1.
|
||||
*/
|
||||
public int run(String[] argv) {
|
||||
try {
|
||||
inputOptions = (OptionsParser.parse(argv));
|
||||
|
||||
LOG.info("Input Options: " + inputOptions);
|
||||
} catch (Throwable e) {
|
||||
LOG.error("Invalid arguments: ", e);
|
||||
System.err.println("Invalid arguments: " + e.getMessage());
|
||||
OptionsParser.usage();
|
||||
return DistCpConstants.INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
try {
|
||||
execute();
|
||||
} catch (InvalidInputException e) {
|
||||
LOG.error("Invalid input: ", e);
|
||||
return DistCpConstants.INVALID_ARGUMENT;
|
||||
} catch (DuplicateFileException e) {
|
||||
LOG.error("Duplicate files in input path: ", e);
|
||||
return DistCpConstants.DUPLICATE_INPUT;
|
||||
} catch (Exception e) {
|
||||
LOG.error("Exception encountered ", e);
|
||||
return DistCpConstants.UNKNOWN_ERROR;
|
||||
}
|
||||
return DistCpConstants.SUCCESS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements the core-execution. Creates the file-list for copy,
|
||||
* and launches the Hadoop-job, to do the copy.
|
||||
* @return Job handle
|
||||
* @throws Exception, on failure.
|
||||
*/
|
||||
public Job execute() throws Exception {
|
||||
assert inputOptions != null;
|
||||
assert getConf() != null;
|
||||
|
||||
Job job = null;
|
||||
try {
|
||||
metaFolder = createMetaFolderPath();
|
||||
jobFS = metaFolder.getFileSystem(getConf());
|
||||
|
||||
job = createJob();
|
||||
createInputFileListing(job);
|
||||
|
||||
job.submit();
|
||||
submitted = true;
|
||||
} finally {
|
||||
if (!submitted) {
|
||||
cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
String jobID = job.getJobID().toString();
|
||||
job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);
|
||||
|
||||
LOG.info("DistCp job-id: " + jobID);
|
||||
if (inputOptions.shouldBlock()) {
|
||||
job.waitForCompletion(true);
|
||||
}
|
||||
return job;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create Job object for submitting it, with all the configuration
|
||||
*
|
||||
* @return Reference to job object.
|
||||
* @throws IOException - Exception if any
|
||||
*/
|
||||
private Job createJob() throws IOException {
|
||||
String jobName = "distcp";
|
||||
String userChosenName = getConf().get(JobContext.JOB_NAME);
|
||||
if (userChosenName != null)
|
||||
jobName += ": " + userChosenName;
|
||||
Job job = Job.getInstance(getConf());
|
||||
job.setJobName(jobName);
|
||||
job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
|
||||
job.setJarByClass(CopyMapper.class);
|
||||
configureOutputFormat(job);
|
||||
|
||||
job.setMapperClass(CopyMapper.class);
|
||||
job.setNumReduceTasks(0);
|
||||
job.setMapOutputKeyClass(Text.class);
|
||||
job.setMapOutputValueClass(Text.class);
|
||||
job.setOutputFormatClass(CopyOutputFormat.class);
|
||||
job.getConfiguration().set(JobContext.MAP_SPECULATIVE, "false");
|
||||
job.getConfiguration().set(JobContext.NUM_MAPS,
|
||||
String.valueOf(inputOptions.getMaxMaps()));
|
||||
|
||||
if (inputOptions.getSslConfigurationFile() != null) {
|
||||
setupSSLConfig(job);
|
||||
}
|
||||
|
||||
inputOptions.appendToConf(job.getConfiguration());
|
||||
return job;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup ssl configuration on the job configuration to enable hsftp access
|
||||
* from map job. Also copy the ssl configuration file to Distributed cache
|
||||
*
|
||||
* @param job - Reference to job's handle
|
||||
* @throws java.io.IOException - Exception if unable to locate ssl config file
|
||||
*/
|
||||
private void setupSSLConfig(Job job) throws IOException {
|
||||
Configuration configuration = job.getConfiguration();
|
||||
Path sslConfigPath = new Path(configuration.
|
||||
getResource(inputOptions.getSslConfigurationFile()).toString());
|
||||
|
||||
addSSLFilesToDistCache(job, sslConfigPath);
|
||||
configuration.set(DistCpConstants.CONF_LABEL_SSL_CONF, sslConfigPath.getName());
|
||||
configuration.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfigPath.getName());
|
||||
}
|
||||
|
||||
/**
|
||||
* Add SSL files to distributed cache. Trust store, key store and ssl config xml
|
||||
*
|
||||
* @param job - Job handle
|
||||
* @param sslConfigPath - ssl Configuration file specified through options
|
||||
* @throws IOException - If any
|
||||
*/
|
||||
private void addSSLFilesToDistCache(Job job,
|
||||
Path sslConfigPath) throws IOException {
|
||||
Configuration configuration = job.getConfiguration();
|
||||
FileSystem localFS = FileSystem.getLocal(configuration);
|
||||
|
||||
Configuration sslConf = new Configuration(false);
|
||||
sslConf.addResource(sslConfigPath);
|
||||
|
||||
Path localStorePath = getLocalStorePath(sslConf,
|
||||
DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION);
|
||||
job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
|
||||
localFS.getWorkingDirectory()).toUri());
|
||||
configuration.set(DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION,
|
||||
localStorePath.getName());
|
||||
|
||||
localStorePath = getLocalStorePath(sslConf,
|
||||
DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION);
|
||||
job.addCacheFile(localStorePath.makeQualified(localFS.getUri(),
|
||||
localFS.getWorkingDirectory()).toUri());
|
||||
configuration.set(DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION,
|
||||
localStorePath.getName());
|
||||
|
||||
job.addCacheFile(sslConfigPath.makeQualified(localFS.getUri(),
|
||||
localFS.getWorkingDirectory()).toUri());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Local Trust store/key store path
|
||||
*
|
||||
* @param sslConf - Config from SSL Client xml
|
||||
* @param storeKey - Key for either trust store or key store
|
||||
* @return - Path where the store is present
|
||||
* @throws IOException -If any
|
||||
*/
|
||||
private Path getLocalStorePath(Configuration sslConf, String storeKey) throws IOException {
|
||||
if (sslConf.get(storeKey) != null) {
|
||||
return new Path(sslConf.get(storeKey));
|
||||
} else {
|
||||
throw new IOException("Store for " + storeKey + " is not set in " +
|
||||
inputOptions.getSslConfigurationFile());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup output format appropriately
|
||||
*
|
||||
* @param job - Job handle
|
||||
* @throws IOException - Exception if any
|
||||
*/
|
||||
private void configureOutputFormat(Job job) throws IOException {
|
||||
final Configuration configuration = job.getConfiguration();
|
||||
Path targetPath = inputOptions.getTargetPath();
|
||||
FileSystem targetFS = targetPath.getFileSystem(configuration);
|
||||
targetPath = targetPath.makeQualified(targetFS.getUri(),
|
||||
targetFS.getWorkingDirectory());
|
||||
|
||||
if (inputOptions.shouldAtomicCommit()) {
|
||||
Path workDir = inputOptions.getAtomicWorkPath();
|
||||
if (workDir == null) {
|
||||
workDir = targetPath.getParent();
|
||||
}
|
||||
workDir = new Path(workDir, WIP_PREFIX + targetPath.getName()
|
||||
+ rand.nextInt());
|
||||
FileSystem workFS = workDir.getFileSystem(configuration);
|
||||
if (!DistCpUtils.compareFs(targetFS, workFS)) {
|
||||
throw new IllegalArgumentException("Work path " + workDir +
|
||||
" and target path " + targetPath + " are in different file system");
|
||||
}
|
||||
CopyOutputFormat.setWorkingDirectory(job, workDir);
|
||||
} else {
|
||||
CopyOutputFormat.setWorkingDirectory(job, targetPath);
|
||||
}
|
||||
CopyOutputFormat.setCommitDirectory(job, targetPath);
|
||||
|
||||
Path logPath = inputOptions.getLogPath();
|
||||
if (logPath == null) {
|
||||
logPath = new Path(metaFolder, "_logs");
|
||||
} else {
|
||||
LOG.info("DistCp job log path: " + logPath);
|
||||
}
|
||||
CopyOutputFormat.setOutputPath(job, logPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create input listing by invoking an appropriate copy listing
|
||||
* implementation. Also add delegation tokens for each path
|
||||
* to job's credential store
|
||||
*
|
||||
* @param job - Handle to job
|
||||
* @return Returns the path where the copy listing is created
|
||||
* @throws IOException - If any
|
||||
*/
|
||||
private Path createInputFileListing(Job job) throws IOException {
|
||||
Path fileListingPath = getFileListingPath();
|
||||
CopyListing copyListing = CopyListing.getCopyListing(job.getConfiguration(),
|
||||
job.getCredentials(), inputOptions);
|
||||
copyListing.buildListing(fileListingPath, inputOptions);
|
||||
return fileListingPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get default name of the copy listing file. Use the meta folder
|
||||
* to create the copy listing file
|
||||
*
|
||||
* @return - Path where the copy listing file has to be saved
|
||||
* @throws IOException - Exception if any
|
||||
*/
|
||||
private Path getFileListingPath() throws IOException {
|
||||
String fileListPathStr = metaFolder + "/fileList.seq";
|
||||
Path path = new Path(fileListPathStr);
|
||||
return new Path(path.toUri().normalize().toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a default working folder for the job, under the
|
||||
* job staging directory
|
||||
*
|
||||
* @return Returns the working folder information
|
||||
* @throws Exception - EXception if any
|
||||
*/
|
||||
private Path createMetaFolderPath() throws Exception {
|
||||
Configuration configuration = getConf();
|
||||
Path stagingDir = JobSubmissionFiles.getStagingDir(
|
||||
new Cluster(configuration), configuration);
|
||||
Path metaFolderPath = new Path(stagingDir, PREFIX + String.valueOf(rand.nextInt()));
|
||||
if (LOG.isDebugEnabled())
|
||||
LOG.debug("Meta folder location: " + metaFolderPath);
|
||||
configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString());
|
||||
return metaFolderPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Main function of the DistCp program. Parses the input arguments (via OptionsParser),
|
||||
* and invokes the DistCp::run() method, via the ToolRunner.
|
||||
* @param argv Command-line arguments sent to DistCp.
|
||||
*/
|
||||
public static void main(String argv[]) {
|
||||
try {
|
||||
DistCp distCp = new DistCp();
|
||||
Cleanup CLEANUP = new Cleanup(distCp);
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(CLEANUP);
|
||||
System.exit(ToolRunner.run(getDefaultConf(), distCp, argv));
|
||||
}
|
||||
catch (Exception e) {
|
||||
LOG.error("Couldn't complete DistCp operation: ", e);
|
||||
System.exit(DistCpConstants.UNKNOWN_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads properties from distcp-default.xml into configuration
|
||||
* object
|
||||
* @return Configuration which includes properties from distcp-default.xml
|
||||
*/
|
||||
private static Configuration getDefaultConf() {
|
||||
Configuration config = new Configuration();
|
||||
config.addResource(DISTCP_DEFAULT_XML);
|
||||
return config;
|
||||
}
|
||||
|
||||
private synchronized void cleanup() {
|
||||
try {
|
||||
if (metaFolder == null) return;
|
||||
|
||||
jobFS.delete(metaFolder, true);
|
||||
metaFolder = null;
|
||||
} catch (IOException e) {
|
||||
LOG.error("Unable to cleanup meta folder: " + metaFolder, e);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSubmitted() {
|
||||
return submitted;
|
||||
}
|
||||
|
||||
private static class Cleanup extends Thread {
|
||||
private final DistCp distCp;
|
||||
|
||||
public Cleanup(DistCp distCp) {
|
||||
this.distCp = distCp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
if (distCp.isSubmitted()) return;
|
||||
|
||||
distCp.cleanup();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
package org.apache.hadoop.tools;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
 * Holder for the constants shared across DistCp: defaults, configuration
 * labels and return codes.
 */
public class DistCpConstants {

  /** Default number of maps to use for DistCp. */
  public static final int DEFAULT_MAPS = 20;

  /** Default bandwidth if none specified. */
  public static final int DEFAULT_BANDWIDTH_MB = 100;

  /**
   * Default strategy for copying. Implementation looked up
   * from distcp-default.xml.
   */
  public static final String UNIFORMSIZE = "uniformsize";

  // ---------------------------------------------------------------------
  // Constants mapping to command line switches/input options
  // ---------------------------------------------------------------------
  public static final String CONF_LABEL_ATOMIC_COPY = "distcp.atomic.copy";
  public static final String CONF_LABEL_WORK_PATH = "distcp.work.path";
  public static final String CONF_LABEL_LOG_PATH = "distcp.log.path";
  public static final String CONF_LABEL_IGNORE_FAILURES = "distcp.ignore.failures";
  public static final String CONF_LABEL_PRESERVE_STATUS = "distcp.preserve.status";
  public static final String CONF_LABEL_SYNC_FOLDERS = "distcp.sync.folders";
  public static final String CONF_LABEL_DELETE_MISSING = "distcp.delete.missing.source";
  public static final String CONF_LABEL_SSL_CONF = "distcp.keystore.resource";
  public static final String CONF_LABEL_MAX_MAPS = "distcp.max.maps";
  public static final String CONF_LABEL_SOURCE_LISTING = "distcp.source.listing";
  public static final String CONF_LABEL_COPY_STRATEGY = "distcp.copy.strategy";
  public static final String CONF_LABEL_SKIP_CRC = "distcp.skip.crc";
  public static final String CONF_LABEL_OVERWRITE = "distcp.copy.overwrite";
  public static final String CONF_LABEL_BANDWIDTH_MB = "distcp.map.bandwidth.mb";

  /** Total bytes to be copied. Updated by copylisting. Unfiltered count. */
  public static final String CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED = "mapred.total.bytes.expected";

  /** Total number of paths to copy, includes directories. Unfiltered count. */
  public static final String CONF_LABEL_TOTAL_NUMBER_OF_RECORDS = "mapred.number.of.records";

  /** SSL keystore resource. */
  public static final String CONF_LABEL_SSL_KEYSTORE = "dfs.https.client.keystore.resource";

  /** If input is based on -f (source listing), the file containing the src paths. */
  public static final String CONF_LABEL_LISTING_FILE_PATH = "distcp.listing.file.path";

  /**
   * Directory where the mapreduce job will write to. If not atomic commit,
   * then same as CONF_LABEL_TARGET_FINAL_PATH.
   */
  public static final String CONF_LABEL_TARGET_WORK_PATH = "distcp.target.work.path";

  /**
   * Directory where the final data will be committed to. If not atomic commit,
   * then same as CONF_LABEL_TARGET_WORK_PATH.
   */
  public static final String CONF_LABEL_TARGET_FINAL_PATH = "distcp.target.final.path";

  /** DistCp job id, for consumers of DistCp. */
  public static final String CONF_LABEL_DISTCP_JOB_ID = "distcp.job.id";

  /** Meta folder where the job's intermediate data is kept. */
  public static final String CONF_LABEL_META_FOLDER = "distcp.meta.folder";

  /** Conf label for SSL Trust-store location. */
  public static final String CONF_LABEL_SSL_TRUST_STORE_LOCATION = "ssl.client.truststore.location";

  /** Conf label for SSL Key-store location. */
  public static final String CONF_LABEL_SSL_KEY_STORE_LOCATION = "ssl.client.keystore.location";

  // ---------------------------------------------------------------------
  // Constants for DistCp return code to shell / consumer of ToolRunner's run
  // ---------------------------------------------------------------------
  public static final int SUCCESS = 0;
  public static final int INVALID_ARGUMENT = -1;
  public static final int DUPLICATE_INPUT = -2;
  public static final int UNKNOWN_ERROR = -999;
}
|
|
@ -0,0 +1,218 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.commons.cli.Option;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
/**
|
||||
* Enumeration mapping configuration keys to distcp command line
|
||||
* options.
|
||||
*/
|
||||
public enum DistCpOptionSwitch {
|
||||
|
||||
/**
|
||||
* Ignores any failures during copy, and continues with rest.
|
||||
* Logs failures in a file
|
||||
*/
|
||||
IGNORE_FAILURES(DistCpConstants.CONF_LABEL_IGNORE_FAILURES,
|
||||
new Option("i", false, "Ignore failures during copy")),
|
||||
|
||||
/**
|
||||
* Preserves status of file/path in the target.
|
||||
* Default behavior with -p, is to preserve replication,
|
||||
* block size, user, group and permission on the target file
|
||||
*
|
||||
* If any of the optional switches are present among rbugp, then
|
||||
* only the corresponding file attribute is preserved
|
||||
*
|
||||
*/
|
||||
PRESERVE_STATUS(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
|
||||
new Option("p", true, "preserve status (rbugp)" +
|
||||
"(replication, block-size, user, group, permission)")),
|
||||
|
||||
/**
|
||||
* Update target location by copying only files that are missing
|
||||
* in the target. This can be used to periodically sync two folders
|
||||
* across source and target. Typically used with DELETE_MISSING
|
||||
* Incompatible with ATOMIC_COMMIT
|
||||
*/
|
||||
SYNC_FOLDERS(DistCpConstants.CONF_LABEL_SYNC_FOLDERS,
|
||||
new Option("update", false, "Update target, copying only missing" +
|
||||
"files or directories")),
|
||||
|
||||
/**
|
||||
* Deletes missing files in target that are missing from source
|
||||
* This allows the target to be in sync with the source contents
|
||||
* Typically used in conjunction with SYNC_FOLDERS
|
||||
* Incompatible with ATOMIC_COMMIT
|
||||
*/
|
||||
DELETE_MISSING(DistCpConstants.CONF_LABEL_DELETE_MISSING,
|
||||
new Option("delete", false, "Delete from target, " +
|
||||
"files missing in source")),
|
||||
|
||||
/**
|
||||
* Configuration file to use with hftps:// for securely copying
|
||||
* files across clusters. Typically the configuration file contains
|
||||
* truststore/keystore information such as location, password and type
|
||||
*/
|
||||
SSL_CONF(DistCpConstants.CONF_LABEL_SSL_CONF,
|
||||
new Option("mapredSslConf", true, "Configuration for ssl config file" +
|
||||
", to use with hftps://")),
|
||||
|
||||
/**
|
||||
* Max number of maps to use during copy. DistCp will split work
|
||||
* as equally as possible among these maps
|
||||
*/
|
||||
MAX_MAPS(DistCpConstants.CONF_LABEL_MAX_MAPS,
|
||||
new Option("m", true, "Max number of concurrent maps to use for copy")),
|
||||
|
||||
/**
|
||||
* Source file listing can be provided to DistCp in a file.
|
||||
* This allows DistCp to copy random list of files from source
|
||||
* and copy them to target
|
||||
*/
|
||||
SOURCE_FILE_LISTING(DistCpConstants.CONF_LABEL_SOURCE_LISTING,
|
||||
new Option("f", true, "List of files that need to be copied")),
|
||||
|
||||
/**
|
||||
* Copy all the source files and commit them atomically to the target
|
||||
* This is typically useful in cases where there is a process
|
||||
* polling for availability of a file/dir. This option is incompatible
|
||||
* with SYNC_FOLDERS & DELETE_MISSING
|
||||
*/
|
||||
ATOMIC_COMMIT(DistCpConstants.CONF_LABEL_ATOMIC_COPY,
|
||||
new Option("atomic", false, "Commit all changes or none")),
|
||||
|
||||
/**
|
||||
* Work path to be used only in conjunction in Atomic commit
|
||||
*/
|
||||
WORK_PATH(DistCpConstants.CONF_LABEL_WORK_PATH,
|
||||
new Option("tmp", true, "Intermediate work path to be used for atomic commit")),
|
||||
|
||||
/**
|
||||
* Log path where distcp output logs are written to
|
||||
*/
|
||||
LOG_PATH(DistCpConstants.CONF_LABEL_LOG_PATH,
|
||||
new Option("log", true, "Folder on DFS where distcp execution logs are saved")),
|
||||
|
||||
/**
|
||||
* Copy strategy is use. This could be dynamic or uniform size etc.
|
||||
* DistCp would use an appropriate input format based on this.
|
||||
*/
|
||||
COPY_STRATEGY(DistCpConstants.CONF_LABEL_COPY_STRATEGY,
|
||||
new Option("strategy", true, "Copy strategy to use. Default is " +
|
||||
"dividing work based on file sizes")),
|
||||
|
||||
/**
|
||||
* Skip CRC checks between source and target, when determining what
|
||||
* files need to be copied.
|
||||
*/
|
||||
SKIP_CRC(DistCpConstants.CONF_LABEL_SKIP_CRC,
|
||||
new Option("skipcrccheck", false, "Whether to skip CRC checks between " +
|
||||
"source and target paths.")),
|
||||
|
||||
/**
|
||||
* Overwrite target-files unconditionally.
|
||||
*/
|
||||
OVERWRITE(DistCpConstants.CONF_LABEL_OVERWRITE,
|
||||
new Option("overwrite", false, "Choose to overwrite target files " +
|
||||
"unconditionally, even if they exist.")),
|
||||
|
||||
/**
|
||||
* Should DisctpExecution be blocking
|
||||
*/
|
||||
BLOCKING("",
|
||||
new Option("async", false, "Should distcp execution be blocking")),
|
||||
|
||||
FILE_LIMIT("",
|
||||
new Option("filelimit", true, "(Deprecated!) Limit number of files " +
|
||||
"copied to <= n")),
|
||||
|
||||
SIZE_LIMIT("",
|
||||
new Option("sizelimit", true, "(Deprecated!) Limit number of files " +
|
||||
"copied to <= n bytes")),
|
||||
|
||||
/**
|
||||
* Specify bandwidth per map in MB
|
||||
*/
|
||||
BANDWIDTH(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
|
||||
new Option("bandwidth", true, "Specify bandwidth per map in MB"));
|
||||
|
||||
private final String confLabel;
|
||||
private final Option option;
|
||||
|
||||
DistCpOptionSwitch(String confLabel, Option option) {
|
||||
this.confLabel = confLabel;
|
||||
this.option = option;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Configuration label for the option
|
||||
* @return configuration label name
|
||||
*/
|
||||
public String getConfigLabel() {
|
||||
return confLabel;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get CLI Option corresponding to the distcp option
|
||||
* @return option
|
||||
*/
|
||||
public Option getOption() {
|
||||
return option;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Switch symbol
|
||||
* @return switch symbol char
|
||||
*/
|
||||
public String getSwitch() {
|
||||
return option.getOpt();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return super.name() + " {" +
|
||||
"confLabel='" + confLabel + '\'' +
|
||||
", option=" + option + '}';
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to add an option to hadoop configuration object
|
||||
* @param conf - Configuration object to include the option
|
||||
* @param option - Option to add
|
||||
* @param value - Value
|
||||
*/
|
||||
public static void addToConf(Configuration conf,
|
||||
DistCpOptionSwitch option,
|
||||
String value) {
|
||||
conf.set(option.getConfigLabel(), value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to set an option to hadoop configuration object
|
||||
* @param conf - Configuration object to include the option
|
||||
* @param option - Option to add
|
||||
*/
|
||||
public static void addToConf(Configuration conf,
|
||||
DistCpOptionSwitch option) {
|
||||
conf.set(option.getConfigLabel(), "true");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,525 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* The Options class encapsulates all DistCp options.
|
||||
* These may be set from command-line (via the OptionsParser)
|
||||
* or may be set manually.
|
||||
*/
|
||||
public class DistCpOptions {
|
||||
|
||||
private boolean atomicCommit = false;
|
||||
private boolean syncFolder = false;
|
||||
private boolean deleteMissing = false;
|
||||
private boolean ignoreFailures = false;
|
||||
private boolean overwrite = false;
|
||||
private boolean skipCRC = false;
|
||||
private boolean blocking = true;
|
||||
|
||||
private int maxMaps = DistCpConstants.DEFAULT_MAPS;
|
||||
private int mapBandwidth = DistCpConstants.DEFAULT_BANDWIDTH_MB;
|
||||
|
||||
private String sslConfigurationFile;
|
||||
|
||||
private String copyStrategy = DistCpConstants.UNIFORMSIZE;
|
||||
|
||||
private EnumSet<FileAttribute> preserveStatus = EnumSet.noneOf(FileAttribute.class);
|
||||
|
||||
private Path atomicWorkPath;
|
||||
|
||||
private Path logPath;
|
||||
|
||||
private Path sourceFileListing;
|
||||
private List<Path> sourcePaths;
|
||||
|
||||
private Path targetPath;
|
||||
|
||||
public static enum FileAttribute{
|
||||
REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION;
|
||||
|
||||
public static FileAttribute getAttribute(char symbol) {
|
||||
for (FileAttribute attribute : values()) {
|
||||
if (attribute.name().charAt(0) == Character.toUpperCase(symbol)) {
|
||||
return attribute;
|
||||
}
|
||||
}
|
||||
throw new NoSuchElementException("No attribute for " + symbol);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor, to initialize source/target paths.
|
||||
* @param sourcePaths List of source-paths (including wildcards)
|
||||
* to be copied to target.
|
||||
* @param targetPath Destination path for the dist-copy.
|
||||
*/
|
||||
public DistCpOptions(List<Path> sourcePaths, Path targetPath) {
|
||||
assert sourcePaths != null && !sourcePaths.isEmpty() : "Invalid source paths";
|
||||
assert targetPath != null : "Invalid Target path";
|
||||
|
||||
this.sourcePaths = sourcePaths;
|
||||
this.targetPath = targetPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor, to initialize source/target paths.
|
||||
* @param sourceFileListing File containing list of source paths
|
||||
* @param targetPath Destination path for the dist-copy.
|
||||
*/
|
||||
public DistCpOptions(Path sourceFileListing, Path targetPath) {
|
||||
assert sourceFileListing != null : "Invalid source paths";
|
||||
assert targetPath != null : "Invalid Target path";
|
||||
|
||||
this.sourceFileListing = sourceFileListing;
|
||||
this.targetPath = targetPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
* @param that DistCpOptions being copied from.
|
||||
*/
|
||||
public DistCpOptions(DistCpOptions that) {
|
||||
if (this != that && that != null) {
|
||||
this.atomicCommit = that.atomicCommit;
|
||||
this.syncFolder = that.syncFolder;
|
||||
this.deleteMissing = that.deleteMissing;
|
||||
this.ignoreFailures = that.ignoreFailures;
|
||||
this.overwrite = that.overwrite;
|
||||
this.skipCRC = that.skipCRC;
|
||||
this.blocking = that.blocking;
|
||||
this.maxMaps = that.maxMaps;
|
||||
this.mapBandwidth = that.mapBandwidth;
|
||||
this.sslConfigurationFile = that.getSslConfigurationFile();
|
||||
this.copyStrategy = that.copyStrategy;
|
||||
this.preserveStatus = that.preserveStatus;
|
||||
this.atomicWorkPath = that.getAtomicWorkPath();
|
||||
this.logPath = that.getLogPath();
|
||||
this.sourceFileListing = that.getSourceFileListing();
|
||||
this.sourcePaths = that.getSourcePaths();
|
||||
this.targetPath = that.getTargetPath();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Should the data be committed atomically?
|
||||
*
|
||||
* @return true if data should be committed automically. false otherwise
|
||||
*/
|
||||
public boolean shouldAtomicCommit() {
|
||||
return atomicCommit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if data need to be committed automatically
|
||||
*
|
||||
* @param atomicCommit - boolean switch
|
||||
*/
|
||||
public void setAtomicCommit(boolean atomicCommit) {
|
||||
validate(DistCpOptionSwitch.ATOMIC_COMMIT, atomicCommit);
|
||||
this.atomicCommit = atomicCommit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should the data be sync'ed between source and target paths?
|
||||
*
|
||||
* @return true if data should be sync'ed up. false otherwise
|
||||
*/
|
||||
public boolean shouldSyncFolder() {
|
||||
return syncFolder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if source and target folder contents be sync'ed up
|
||||
*
|
||||
* @param syncFolder - boolean switch
|
||||
*/
|
||||
public void setSyncFolder(boolean syncFolder) {
|
||||
validate(DistCpOptionSwitch.SYNC_FOLDERS, syncFolder);
|
||||
this.syncFolder = syncFolder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should target files missing in source should be deleted?
|
||||
*
|
||||
* @return true if zoombie target files to be removed. false otherwise
|
||||
*/
|
||||
public boolean shouldDeleteMissing() {
|
||||
return deleteMissing;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if files only present in target should be deleted
|
||||
*
|
||||
* @param deleteMissing - boolean switch
|
||||
*/
|
||||
public void setDeleteMissing(boolean deleteMissing) {
|
||||
validate(DistCpOptionSwitch.DELETE_MISSING, deleteMissing);
|
||||
this.deleteMissing = deleteMissing;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should failures be logged and ignored during copy?
|
||||
*
|
||||
* @return true if failures are to be logged and ignored. false otherwise
|
||||
*/
|
||||
public boolean shouldIgnoreFailures() {
|
||||
return ignoreFailures;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if failures during copy be ignored
|
||||
*
|
||||
* @param ignoreFailures - boolean switch
|
||||
*/
|
||||
public void setIgnoreFailures(boolean ignoreFailures) {
|
||||
this.ignoreFailures = ignoreFailures;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should DistCp be running in blocking mode
|
||||
*
|
||||
* @return true if should run in blocking, false otherwise
|
||||
*/
|
||||
public boolean shouldBlock() {
|
||||
return blocking;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if Disctp should run blocking or non-blocking
|
||||
*
|
||||
* @param blocking - boolean switch
|
||||
*/
|
||||
public void setBlocking(boolean blocking) {
|
||||
this.blocking = blocking;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should files be overwritten always?
|
||||
*
|
||||
* @return true if files in target that may exist before distcp, should always
|
||||
* be overwritten. false otherwise
|
||||
*/
|
||||
public boolean shouldOverwrite() {
|
||||
return overwrite;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if files should always be overwritten on target
|
||||
*
|
||||
* @param overwrite - boolean switch
|
||||
*/
|
||||
public void setOverwrite(boolean overwrite) {
|
||||
validate(DistCpOptionSwitch.OVERWRITE, overwrite);
|
||||
this.overwrite = overwrite;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should CRC/checksum check be skipped while checking files are identical
|
||||
*
|
||||
* @return true if checksum check should be skipped while checking files are
|
||||
* identical. false otherwise
|
||||
*/
|
||||
public boolean shouldSkipCRC() {
|
||||
return skipCRC;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set if checksum comparison should be skipped while determining if
|
||||
* source and destination files are identical
|
||||
*
|
||||
* @param skipCRC - boolean switch
|
||||
*/
|
||||
public void setSkipCRC(boolean skipCRC) {
|
||||
validate(DistCpOptionSwitch.SKIP_CRC, skipCRC);
|
||||
this.skipCRC = skipCRC;
|
||||
}
|
||||
|
||||
/** Get the max number of maps to use for this copy
|
||||
*
|
||||
* @return Max number of maps
|
||||
*/
|
||||
public int getMaxMaps() {
|
||||
return maxMaps;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the max number of maps to use for copy
|
||||
*
|
||||
* @param maxMaps - Number of maps
|
||||
*/
|
||||
public void setMaxMaps(int maxMaps) {
|
||||
this.maxMaps = maxMaps;
|
||||
}
|
||||
|
||||
/** Get the map bandwidth in MB
|
||||
*
|
||||
* @return Bandwidth in MB
|
||||
*/
|
||||
public int getMapBandwidth() {
|
||||
return mapBandwidth;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set per map bandwidth
|
||||
*
|
||||
* @param mapBandwidth - per map bandwidth
|
||||
*/
|
||||
public void setMapBandwidth(int mapBandwidth) {
|
||||
assert mapBandwidth > 0 : "Bandwidth " + mapBandwidth + " is invalid (should be > 0)";
|
||||
this.mapBandwidth = mapBandwidth;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get path where the ssl configuration file is present to use for hftps://
|
||||
*
|
||||
* @return Path on local file system
|
||||
*/
|
||||
public String getSslConfigurationFile() {
|
||||
return sslConfigurationFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the SSL configuration file path to use with hftps:// (local path)
|
||||
*
|
||||
* @param sslConfigurationFile - Local ssl config file path
|
||||
*/
|
||||
public void setSslConfigurationFile(String sslConfigurationFile) {
|
||||
this.sslConfigurationFile = sslConfigurationFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator with the list of file attributes to preserve
|
||||
*
|
||||
* @return iterator of file attributes to preserve
|
||||
*/
|
||||
public Iterator<FileAttribute> preserveAttributes() {
|
||||
return preserveStatus.iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the input attibute should be preserved or not
|
||||
*
|
||||
* @param attribute - Attribute to check
|
||||
* @return True if attribute should be preserved, false otherwise
|
||||
*/
|
||||
public boolean shouldPreserve(FileAttribute attribute) {
|
||||
return preserveStatus.contains(attribute);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add file attributes that need to be preserved. This method may be
|
||||
* called multiple times to add attributes.
|
||||
*
|
||||
* @param fileAttribute - Attribute to add, one at a time
|
||||
*/
|
||||
public void preserve(FileAttribute fileAttribute) {
|
||||
for (FileAttribute attribute : preserveStatus) {
|
||||
if (attribute.equals(fileAttribute)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
preserveStatus.add(fileAttribute);
|
||||
}
|
||||
|
||||
/** Get work path for atomic commit. If null, the work
|
||||
* path would be parentOf(targetPath) + "/._WIP_" + nameOf(targetPath)
|
||||
*
|
||||
* @return Atomic work path on the target cluster. Null if not set
|
||||
*/
|
||||
public Path getAtomicWorkPath() {
|
||||
return atomicWorkPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the work path for atomic commit
|
||||
*
|
||||
* @param atomicWorkPath - Path on the target cluster
|
||||
*/
|
||||
public void setAtomicWorkPath(Path atomicWorkPath) {
|
||||
this.atomicWorkPath = atomicWorkPath;
|
||||
}
|
||||
|
||||
/** Get output directory for writing distcp logs. Otherwise logs
|
||||
* are temporarily written to JobStagingDir/_logs and deleted
|
||||
* upon job completion
|
||||
*
|
||||
* @return Log output path on the cluster where distcp job is run
|
||||
*/
|
||||
public Path getLogPath() {
|
||||
return logPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the log path where distcp output logs are stored
|
||||
* Uses JobStagingDir/_logs by default
|
||||
*
|
||||
* @param logPath - Path where logs will be saved
|
||||
*/
|
||||
public void setLogPath(Path logPath) {
|
||||
this.logPath = logPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the copy strategy to use. Uses appropriate input format
|
||||
*
|
||||
* @return copy strategy to use
|
||||
*/
|
||||
public String getCopyStrategy() {
|
||||
return copyStrategy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the copy strategy to use. Should map to a strategy implementation
|
||||
* in distp-default.xml
|
||||
*
|
||||
* @param copyStrategy - copy Strategy to use
|
||||
*/
|
||||
public void setCopyStrategy(String copyStrategy) {
|
||||
this.copyStrategy = copyStrategy;
|
||||
}
|
||||
|
||||
/**
|
||||
* File path (hdfs:// or file://) that contains the list of actual
|
||||
* files to copy
|
||||
*
|
||||
* @return - Source listing file path
|
||||
*/
|
||||
public Path getSourceFileListing() {
|
||||
return sourceFileListing;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for sourcePaths.
|
||||
* @return List of source-paths.
|
||||
*/
|
||||
public List<Path> getSourcePaths() {
|
||||
return sourcePaths;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter for sourcePaths.
|
||||
* @param sourcePaths The new list of source-paths.
|
||||
*/
|
||||
public void setSourcePaths(List<Path> sourcePaths) {
|
||||
assert sourcePaths != null && sourcePaths.size() != 0;
|
||||
this.sourcePaths = sourcePaths;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for the targetPath.
|
||||
* @return The target-path.
|
||||
*/
|
||||
public Path getTargetPath() {
|
||||
return targetPath;
|
||||
}
|
||||
|
||||
public void validate(DistCpOptionSwitch option, boolean value) {
|
||||
|
||||
boolean syncFolder = (option == DistCpOptionSwitch.SYNC_FOLDERS ?
|
||||
value : this.syncFolder);
|
||||
boolean overwrite = (option == DistCpOptionSwitch.OVERWRITE ?
|
||||
value : this.overwrite);
|
||||
boolean deleteMissing = (option == DistCpOptionSwitch.DELETE_MISSING ?
|
||||
value : this.deleteMissing);
|
||||
boolean atomicCommit = (option == DistCpOptionSwitch.ATOMIC_COMMIT ?
|
||||
value : this.atomicCommit);
|
||||
boolean skipCRC = (option == DistCpOptionSwitch.SKIP_CRC ?
|
||||
value : this.skipCRC);
|
||||
|
||||
if (syncFolder && atomicCommit) {
|
||||
throw new IllegalArgumentException("Atomic commit can't be used with " +
|
||||
"sync folder or overwrite options");
|
||||
}
|
||||
|
||||
if (deleteMissing && !(overwrite || syncFolder)) {
|
||||
throw new IllegalArgumentException("Delete missing is applicable " +
|
||||
"only with update or overwrite options");
|
||||
}
|
||||
|
||||
if (overwrite && syncFolder) {
|
||||
throw new IllegalArgumentException("Overwrite and update options are " +
|
||||
"mutually exclusive");
|
||||
}
|
||||
|
||||
if (!syncFolder && skipCRC) {
|
||||
throw new IllegalArgumentException("Skip CRC is valid only with update options");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Add options to configuration. These will be used in the Mapper/committer
|
||||
*
|
||||
* @param conf - Configruation object to which the options need to be added
|
||||
*/
|
||||
public void appendToConf(Configuration conf) {
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT,
|
||||
String.valueOf(atomicCommit));
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.IGNORE_FAILURES,
|
||||
String.valueOf(ignoreFailures));
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS,
|
||||
String.valueOf(syncFolder));
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DELETE_MISSING,
|
||||
String.valueOf(deleteMissing));
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.OVERWRITE,
|
||||
String.valueOf(overwrite));
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SKIP_CRC,
|
||||
String.valueOf(skipCRC));
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH,
|
||||
String.valueOf(mapBandwidth));
|
||||
DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.PRESERVE_STATUS,
|
||||
DistCpUtils.packAttributes(preserveStatus));
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility to easily string-ify Options, for logging.
|
||||
*
|
||||
* @return String representation of the Options.
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DistCpOptions{" +
|
||||
"atomicCommit=" + atomicCommit +
|
||||
", syncFolder=" + syncFolder +
|
||||
", deleteMissing=" + deleteMissing +
|
||||
", ignoreFailures=" + ignoreFailures +
|
||||
", maxMaps=" + maxMaps +
|
||||
", sslConfigurationFile='" + sslConfigurationFile + '\'' +
|
||||
", copyStrategy='" + copyStrategy + '\'' +
|
||||
", sourceFileListing=" + sourceFileListing +
|
||||
", sourcePaths=" + sourcePaths +
|
||||
", targetPath=" + targetPath +
|
||||
'}';
|
||||
}
|
||||
|
||||
@Override
|
||||
protected DistCpOptions clone() throws CloneNotSupportedException {
|
||||
return (DistCpOptions) super.clone();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,100 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.security.Credentials;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* FileBasedCopyListing implements the CopyListing interface,
|
||||
* to create the copy-listing for DistCp,
|
||||
* by iterating over all source paths mentioned in a specified input-file.
|
||||
*/
|
||||
public class FileBasedCopyListing extends CopyListing {
|
||||
|
||||
private final CopyListing globbedListing;
|
||||
/**
|
||||
* Constructor, to initialize base-class.
|
||||
* @param configuration The input Configuration object.
|
||||
* @param credentials - Credentials object on which the FS delegation tokens are cached. If null
|
||||
* delegation token caching is skipped
|
||||
*/
|
||||
public FileBasedCopyListing(Configuration configuration, Credentials credentials) {
|
||||
super(configuration, credentials);
|
||||
globbedListing = new GlobbedCopyListing(getConf(), credentials);
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected void validatePaths(DistCpOptions options)
|
||||
throws IOException, InvalidInputException {
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of CopyListing::buildListing().
|
||||
* Iterates over all source paths mentioned in the input-file.
|
||||
* @param pathToListFile Path on HDFS where the listing file is written.
|
||||
* @param options Input Options for DistCp (indicating source/target paths.)
|
||||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
public void doBuildListing(Path pathToListFile, DistCpOptions options) throws IOException {
|
||||
DistCpOptions newOption = new DistCpOptions(options);
|
||||
newOption.setSourcePaths(fetchFileList(options.getSourceFileListing()));
|
||||
globbedListing.buildListing(pathToListFile, newOption);
|
||||
}
|
||||
|
||||
private List<Path> fetchFileList(Path sourceListing) throws IOException {
|
||||
List<Path> result = new ArrayList<Path>();
|
||||
FileSystem fs = sourceListing.getFileSystem(getConf());
|
||||
BufferedReader input = null;
|
||||
try {
|
||||
input = new BufferedReader(new InputStreamReader(fs.open(sourceListing)));
|
||||
String line = input.readLine();
|
||||
while (line != null) {
|
||||
result.add(new Path(line));
|
||||
line = input.readLine();
|
||||
}
|
||||
} finally {
|
||||
IOUtils.closeStream(input);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected long getBytesToCopy() {
|
||||
return globbedListing.getBytesToCopy();
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected long getNumberOfPaths() {
|
||||
return globbedListing.getNumberOfPaths();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,105 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.security.Credentials;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* GlobbedCopyListing implements the CopyListing interface, to create the copy
|
||||
* listing-file by "globbing" all specified source paths (wild-cards and all.)
|
||||
*/
|
||||
public class GlobbedCopyListing extends CopyListing {
|
||||
private static final Log LOG = LogFactory.getLog(GlobbedCopyListing.class);
|
||||
|
||||
private final CopyListing simpleListing;
|
||||
/**
|
||||
* Constructor, to initialize the configuration.
|
||||
* @param configuration The input Configuration object.
|
||||
* @param credentials Credentials object on which the FS delegation tokens are cached. If null
|
||||
* delegation token caching is skipped
|
||||
*/
|
||||
public GlobbedCopyListing(Configuration configuration, Credentials credentials) {
|
||||
super(configuration, credentials);
|
||||
simpleListing = new SimpleCopyListing(getConf(), credentials) ;
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected void validatePaths(DistCpOptions options)
|
||||
throws IOException, InvalidInputException {
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of CopyListing::buildListing().
|
||||
* Creates the copy listing by "globbing" all source-paths.
|
||||
* @param pathToListingFile The location at which the copy-listing file
|
||||
* is to be created.
|
||||
* @param options Input Options for DistCp (indicating source/target paths.)
|
||||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
public void doBuildListing(Path pathToListingFile,
|
||||
DistCpOptions options) throws IOException {
|
||||
|
||||
List<Path> globbedPaths = new ArrayList<Path>();
|
||||
if (options.getSourcePaths().isEmpty()) {
|
||||
throw new InvalidInputException("Nothing to process. Source paths::EMPTY");
|
||||
}
|
||||
|
||||
for (Path p : options.getSourcePaths()) {
|
||||
FileSystem fs = p.getFileSystem(getConf());
|
||||
FileStatus[] inputs = fs.globStatus(p);
|
||||
|
||||
if(inputs != null && inputs.length > 0) {
|
||||
for (FileStatus onePath: inputs) {
|
||||
globbedPaths.add(onePath.getPath());
|
||||
}
|
||||
} else {
|
||||
throw new InvalidInputException(p + " doesn't exist");
|
||||
}
|
||||
}
|
||||
|
||||
DistCpOptions optionsGlobbed = new DistCpOptions(options);
|
||||
optionsGlobbed.setSourcePaths(globbedPaths);
|
||||
simpleListing.buildListing(pathToListingFile, optionsGlobbed);
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected long getBytesToCopy() {
|
||||
return simpleListing.getBytesToCopy();
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected long getNumberOfPaths() {
|
||||
return simpleListing.getNumberOfPaths();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,246 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.commons.cli.*;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The OptionsParser parses out the command-line options passed to DistCp,
|
||||
* and interprets those specific to DistCp, to create an Options object.
|
||||
*/
|
||||
public class OptionsParser {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(OptionsParser.class);
|
||||
|
||||
private static final Options cliOptions = new Options();
|
||||
|
||||
static {
|
||||
for (DistCpOptionSwitch option : DistCpOptionSwitch.values()) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Adding option " + option.getOption());
|
||||
}
|
||||
cliOptions.addOption(option.getOption());
|
||||
}
|
||||
}
|
||||
|
||||
private static class CustomParser extends GnuParser {
|
||||
@Override
|
||||
protected String[] flatten(Options options, String[] arguments, boolean stopAtNonOption) {
|
||||
for (int index = 0; index < arguments.length; index++) {
|
||||
if (arguments[index].equals("-" + DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
|
||||
arguments[index] = "-prbugp";
|
||||
}
|
||||
}
|
||||
return super.flatten(options, arguments, stopAtNonOption);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The parse method parses the command-line options, and creates
|
||||
* a corresponding Options object.
|
||||
* @param args Command-line arguments (excluding the options consumed
|
||||
* by the GenericOptionsParser).
|
||||
* @return The Options object, corresponding to the specified command-line.
|
||||
* @throws IllegalArgumentException: Thrown if the parse fails.
|
||||
*/
|
||||
public static DistCpOptions parse(String args[]) throws IllegalArgumentException {
|
||||
|
||||
CommandLineParser parser = new CustomParser();
|
||||
|
||||
CommandLine command;
|
||||
try {
|
||||
command = parser.parse(cliOptions, args, true);
|
||||
} catch (ParseException e) {
|
||||
throw new IllegalArgumentException("Unable to parse arguments. " +
|
||||
Arrays.toString(args), e);
|
||||
}
|
||||
|
||||
DistCpOptions option;
|
||||
Path targetPath;
|
||||
List<Path> sourcePaths = new ArrayList<Path>();
|
||||
|
||||
String leftOverArgs[] = command.getArgs();
|
||||
if (leftOverArgs == null || leftOverArgs.length < 1) {
|
||||
throw new IllegalArgumentException("Target path not specified");
|
||||
}
|
||||
|
||||
//Last Argument is the target path
|
||||
targetPath = new Path(leftOverArgs[leftOverArgs.length -1].trim());
|
||||
|
||||
//Copy any source paths in the arguments to the list
|
||||
for (int index = 0; index < leftOverArgs.length - 1; index++) {
|
||||
sourcePaths.add(new Path(leftOverArgs[index].trim()));
|
||||
}
|
||||
|
||||
/* If command has source file listing, use it else, fall back on source paths in args
|
||||
If both are present, throw exception and bail */
|
||||
if (command.hasOption(DistCpOptionSwitch.SOURCE_FILE_LISTING.getSwitch())) {
|
||||
if (!sourcePaths.isEmpty()) {
|
||||
throw new IllegalArgumentException("Both source file listing and source paths present");
|
||||
}
|
||||
option = new DistCpOptions(new Path(getVal(command, DistCpOptionSwitch.
|
||||
SOURCE_FILE_LISTING.getSwitch())), targetPath);
|
||||
} else {
|
||||
if (sourcePaths.isEmpty()) {
|
||||
throw new IllegalArgumentException("Neither source file listing nor source paths present");
|
||||
}
|
||||
option = new DistCpOptions(sourcePaths, targetPath);
|
||||
}
|
||||
|
||||
//Process all the other option switches and set options appropriately
|
||||
if (command.hasOption(DistCpOptionSwitch.IGNORE_FAILURES.getSwitch())) {
|
||||
option.setIgnoreFailures(true);
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.ATOMIC_COMMIT.getSwitch())) {
|
||||
option.setAtomicCommit(true);
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch()) &&
|
||||
option.shouldAtomicCommit()) {
|
||||
String workPath = getVal(command, DistCpOptionSwitch.WORK_PATH.getSwitch());
|
||||
if (workPath != null && !workPath.isEmpty()) {
|
||||
option.setAtomicWorkPath(new Path(workPath));
|
||||
}
|
||||
} else if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch())) {
|
||||
throw new IllegalArgumentException("-tmp work-path can only be specified along with -atomic");
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.LOG_PATH.getSwitch())) {
|
||||
option.setLogPath(new Path(getVal(command, DistCpOptionSwitch.LOG_PATH.getSwitch())));
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.SYNC_FOLDERS.getSwitch())) {
|
||||
option.setSyncFolder(true);
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.OVERWRITE.getSwitch())) {
|
||||
option.setOverwrite(true);
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.DELETE_MISSING.getSwitch())) {
|
||||
option.setDeleteMissing(true);
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.SKIP_CRC.getSwitch())) {
|
||||
option.setSkipCRC(true);
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.BLOCKING.getSwitch())) {
|
||||
option.setBlocking(false);
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.BANDWIDTH.getSwitch())) {
|
||||
try {
|
||||
Integer mapBandwidth = Integer.parseInt(
|
||||
getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()).trim());
|
||||
option.setMapBandwidth(mapBandwidth);
|
||||
} catch (NumberFormatException e) {
|
||||
throw new IllegalArgumentException("Bandwidth specified is invalid: " +
|
||||
getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()), e);
|
||||
}
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.SSL_CONF.getSwitch())) {
|
||||
option.setSslConfigurationFile(command.
|
||||
getOptionValue(DistCpOptionSwitch.SSL_CONF.getSwitch()));
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.MAX_MAPS.getSwitch())) {
|
||||
try {
|
||||
Integer maps = Integer.parseInt(
|
||||
getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()).trim());
|
||||
option.setMaxMaps(maps);
|
||||
} catch (NumberFormatException e) {
|
||||
throw new IllegalArgumentException("Number of maps is invalid: " +
|
||||
getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()), e);
|
||||
}
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.COPY_STRATEGY.getSwitch())) {
|
||||
option.setCopyStrategy(
|
||||
getVal(command, DistCpOptionSwitch.COPY_STRATEGY.getSwitch()));
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) {
|
||||
String attributes =
|
||||
getVal(command, DistCpOptionSwitch.PRESERVE_STATUS.getSwitch());
|
||||
if (attributes == null || attributes.isEmpty()) {
|
||||
for (FileAttribute attribute : FileAttribute.values()) {
|
||||
option.preserve(attribute);
|
||||
}
|
||||
} else {
|
||||
for (int index = 0; index < attributes.length(); index++) {
|
||||
option.preserve(FileAttribute.
|
||||
getAttribute(attributes.charAt(index)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.FILE_LIMIT.getSwitch())) {
|
||||
String fileLimitString = getVal(command,
|
||||
DistCpOptionSwitch.FILE_LIMIT.getSwitch().trim());
|
||||
try {
|
||||
Integer.parseInt(fileLimitString);
|
||||
}
|
||||
catch (NumberFormatException e) {
|
||||
throw new IllegalArgumentException("File-limit is invalid: "
|
||||
+ fileLimitString, e);
|
||||
}
|
||||
LOG.warn(DistCpOptionSwitch.FILE_LIMIT.getSwitch() + " is a deprecated" +
|
||||
" option. Ignoring.");
|
||||
}
|
||||
|
||||
if (command.hasOption(DistCpOptionSwitch.SIZE_LIMIT.getSwitch())) {
|
||||
String sizeLimitString = getVal(command,
|
||||
DistCpOptionSwitch.SIZE_LIMIT.getSwitch().trim());
|
||||
try {
|
||||
Long.parseLong(sizeLimitString);
|
||||
}
|
||||
catch (NumberFormatException e) {
|
||||
throw new IllegalArgumentException("Size-limit is invalid: "
|
||||
+ sizeLimitString, e);
|
||||
}
|
||||
LOG.warn(DistCpOptionSwitch.SIZE_LIMIT.getSwitch() + " is a deprecated" +
|
||||
" option. Ignoring.");
|
||||
}
|
||||
|
||||
return option;
|
||||
}
|
||||
|
||||
private static String getVal(CommandLine command, String swtch) {
|
||||
String optionValue = command.getOptionValue(swtch);
|
||||
if (optionValue == null) {
|
||||
return null;
|
||||
} else {
|
||||
return optionValue.trim();
|
||||
}
|
||||
}
|
||||
|
||||
public static void usage() {
|
||||
HelpFormatter formatter = new HelpFormatter();
|
||||
formatter.printHelp("distcp OPTIONS [source_path...] <target_path>\n\nOPTIONS", cliOptions);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,275 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.DataInputBuffer;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.mapreduce.security.TokenCache;
|
||||
import org.apache.hadoop.security.Credentials;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Stack;
|
||||
|
||||
/**
|
||||
* The SimpleCopyListing is responsible for making the exhaustive list of
|
||||
* all files/directories under its specified list of input-paths.
|
||||
* These are written into the specified copy-listing file.
|
||||
* Note: The SimpleCopyListing doesn't handle wild-cards in the input-paths.
|
||||
*/
|
||||
public class SimpleCopyListing extends CopyListing {
|
||||
private static final Log LOG = LogFactory.getLog(SimpleCopyListing.class);
|
||||
|
||||
private long totalPaths = 0;
|
||||
private long totalBytesToCopy = 0;
|
||||
|
||||
/**
 * Protected constructor; hands the configuration and credentials to the
 * CopyListing base class.
 *
 * @param configuration The configuration used to access the source/target
 *                      FileSystems.
 * @param credentials Credentials object on which the FS delegation tokens
 *                    are cached. If null, delegation token caching is skipped.
 */
protected SimpleCopyListing(Configuration configuration, Credentials credentials) {
  super(configuration, credentials);
}
|
||||
|
||||
@Override
|
||||
protected void validatePaths(DistCpOptions options)
|
||||
throws IOException, InvalidInputException {
|
||||
|
||||
Path targetPath = options.getTargetPath();
|
||||
FileSystem targetFS = targetPath.getFileSystem(getConf());
|
||||
boolean targetIsFile = targetFS.isFile(targetPath);
|
||||
|
||||
//If target is a file, then source has to be single file
|
||||
if (targetIsFile) {
|
||||
if (options.getSourcePaths().size() > 1) {
|
||||
throw new InvalidInputException("Multiple source being copied to a file: " +
|
||||
targetPath);
|
||||
}
|
||||
|
||||
Path srcPath = options.getSourcePaths().get(0);
|
||||
FileSystem sourceFS = srcPath.getFileSystem(getConf());
|
||||
if (!sourceFS.isFile(srcPath)) {
|
||||
throw new InvalidInputException("Cannot copy " + srcPath +
|
||||
", which is not a file to " + targetPath);
|
||||
}
|
||||
}
|
||||
|
||||
if (options.shouldAtomicCommit() && targetFS.exists(targetPath)) {
|
||||
throw new InvalidInputException("Target path for atomic-commit already exists: " +
|
||||
targetPath + ". Cannot atomic-commit to pre-existing target-path.");
|
||||
}
|
||||
|
||||
for (Path path: options.getSourcePaths()) {
|
||||
FileSystem fs = path.getFileSystem(getConf());
|
||||
if (!fs.exists(path)) {
|
||||
throw new InvalidInputException(path + " doesn't exist");
|
||||
}
|
||||
}
|
||||
|
||||
/* This is requires to allow map tasks to access each of the source
|
||||
clusters. This would retrieve the delegation token for each unique
|
||||
file system and add them to job's private credential store
|
||||
*/
|
||||
Credentials credentials = getCredentials();
|
||||
if (credentials != null) {
|
||||
Path[] inputPaths = options.getSourcePaths().toArray(new Path[1]);
|
||||
TokenCache.obtainTokensForNamenodes(credentials, inputPaths, getConf());
|
||||
}
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException {
|
||||
|
||||
SequenceFile.Writer fileListWriter = null;
|
||||
|
||||
try {
|
||||
fileListWriter = getWriter(pathToListingFile);
|
||||
|
||||
for (Path path: options.getSourcePaths()) {
|
||||
FileSystem sourceFS = path.getFileSystem(getConf());
|
||||
path = makeQualified(path);
|
||||
|
||||
FileStatus rootStatus = sourceFS.getFileStatus(path);
|
||||
Path sourcePathRoot = computeSourceRootPath(rootStatus, options);
|
||||
boolean localFile = (rootStatus.getClass() != FileStatus.class);
|
||||
|
||||
FileStatus[] sourceFiles = sourceFS.listStatus(path);
|
||||
if (sourceFiles != null && sourceFiles.length > 0) {
|
||||
for (FileStatus sourceStatus: sourceFiles) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Recording source-path: " + sourceStatus.getPath() + " for copy.");
|
||||
}
|
||||
writeToFileListing(fileListWriter, sourceStatus, sourcePathRoot, localFile);
|
||||
|
||||
if (isDirectoryAndNotEmpty(sourceFS, sourceStatus)) {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Traversing non-empty source dir: " + sourceStatus.getPath());
|
||||
}
|
||||
traverseNonEmptyDirectory(fileListWriter, sourceStatus, sourcePathRoot, localFile);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
writeToFileListing(fileListWriter, rootStatus, sourcePathRoot, localFile);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
IOUtils.closeStream(fileListWriter);
|
||||
}
|
||||
}
|
||||
|
||||
private Path computeSourceRootPath(FileStatus sourceStatus,
|
||||
DistCpOptions options) throws IOException {
|
||||
|
||||
Path target = options.getTargetPath();
|
||||
FileSystem targetFS = target.getFileSystem(getConf());
|
||||
|
||||
boolean solitaryFile = options.getSourcePaths().size() == 1
|
||||
&& !sourceStatus.isDirectory();
|
||||
|
||||
if (solitaryFile) {
|
||||
if (targetFS.isFile(target) || !targetFS.exists(target)) {
|
||||
return sourceStatus.getPath();
|
||||
} else {
|
||||
return sourceStatus.getPath().getParent();
|
||||
}
|
||||
} else {
|
||||
boolean specialHandling = (options.getSourcePaths().size() == 1 && !targetFS.exists(target)) ||
|
||||
options.shouldSyncFolder() || options.shouldOverwrite();
|
||||
|
||||
return specialHandling && sourceStatus.isDirectory() ? sourceStatus.getPath() :
|
||||
sourceStatus.getPath().getParent();
|
||||
}
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected long getBytesToCopy() {
|
||||
return totalBytesToCopy;
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
protected long getNumberOfPaths() {
|
||||
return totalPaths;
|
||||
}
|
||||
|
||||
private Path makeQualified(Path path) throws IOException {
|
||||
final FileSystem fs = path.getFileSystem(getConf());
|
||||
return path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
|
||||
}
|
||||
|
||||
private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException {
|
||||
FileSystem fs = pathToListFile.getFileSystem(getConf());
|
||||
if (fs.exists(pathToListFile)) {
|
||||
fs.delete(pathToListFile, false);
|
||||
}
|
||||
return SequenceFile.createWriter(getConf(),
|
||||
SequenceFile.Writer.file(pathToListFile),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(FileStatus.class),
|
||||
SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE));
|
||||
}
|
||||
|
||||
private static boolean isDirectoryAndNotEmpty(FileSystem fileSystem,
|
||||
FileStatus fileStatus) throws IOException {
|
||||
return fileStatus.isDirectory() && getChildren(fileSystem, fileStatus).length > 0;
|
||||
}
|
||||
|
||||
private static FileStatus[] getChildren(FileSystem fileSystem,
|
||||
FileStatus parent) throws IOException {
|
||||
return fileSystem.listStatus(parent.getPath());
|
||||
}
|
||||
|
||||
private void traverseNonEmptyDirectory(SequenceFile.Writer fileListWriter,
|
||||
FileStatus sourceStatus,
|
||||
Path sourcePathRoot, boolean localFile)
|
||||
throws IOException {
|
||||
FileSystem sourceFS = sourcePathRoot.getFileSystem(getConf());
|
||||
Stack<FileStatus> pathStack = new Stack<FileStatus>();
|
||||
pathStack.push(sourceStatus);
|
||||
|
||||
while (!pathStack.isEmpty()) {
|
||||
for (FileStatus child: getChildren(sourceFS, pathStack.pop())) {
|
||||
if (LOG.isDebugEnabled())
|
||||
LOG.debug("Recording source-path: "
|
||||
+ sourceStatus.getPath() + " for copy.");
|
||||
writeToFileListing(fileListWriter, child, sourcePathRoot, localFile);
|
||||
if (isDirectoryAndNotEmpty(sourceFS, child)) {
|
||||
if (LOG.isDebugEnabled())
|
||||
LOG.debug("Traversing non-empty source dir: "
|
||||
+ sourceStatus.getPath());
|
||||
pathStack.push(child);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void writeToFileListing(SequenceFile.Writer fileListWriter,
|
||||
FileStatus fileStatus, Path sourcePathRoot,
|
||||
boolean localFile) throws IOException {
|
||||
if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDirectory())
|
||||
return; // Skip the root-paths.
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("REL PATH: " + DistCpUtils.getRelativePath(sourcePathRoot,
|
||||
fileStatus.getPath()) + ", FULL PATH: " + fileStatus.getPath());
|
||||
}
|
||||
|
||||
FileStatus status = fileStatus;
|
||||
if (localFile) {
|
||||
status = getFileStatus(fileStatus);
|
||||
}
|
||||
|
||||
fileListWriter.append(new Text(DistCpUtils.getRelativePath(sourcePathRoot,
|
||||
fileStatus.getPath())), status);
|
||||
fileListWriter.sync();
|
||||
|
||||
if (!fileStatus.isDirectory()) {
|
||||
totalBytesToCopy += fileStatus.getLen();
|
||||
}
|
||||
totalPaths++;
|
||||
}
|
||||
|
||||
private static final ByteArrayOutputStream buffer = new ByteArrayOutputStream(64);
|
||||
private DataInputBuffer in = new DataInputBuffer();
|
||||
|
||||
private FileStatus getFileStatus(FileStatus fileStatus) throws IOException {
|
||||
FileStatus status = new FileStatus();
|
||||
|
||||
buffer.reset();
|
||||
DataOutputStream out = new DataOutputStream(buffer);
|
||||
fileStatus.write(out);
|
||||
|
||||
in.reset(buffer.toByteArray(), 0, buffer.size());
|
||||
status.readFields(in);
|
||||
return status;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,297 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools.mapred;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.*;
|
||||
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
|
||||
import org.apache.hadoop.tools.*;
|
||||
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The CopyCommitter class is DistCp's OutputCommitter implementation. It is
|
||||
* responsible for handling the completion/cleanup of the DistCp run.
|
||||
* Specifically, it does the following:
|
||||
* 1. Cleanup of the meta-folder (where DistCp maintains its file-list, etc.)
|
||||
* 2. Preservation of user/group/replication-factor on any directories that
|
||||
* have been copied. (Files are taken care of in their map-tasks.)
|
||||
* 3. Atomic-move of data from the temporary work-folder to the final path
|
||||
* (if atomic-commit was opted for).
|
||||
* 4. Deletion of files from the target that are missing at source (if opted for).
|
||||
* 5. Cleanup of any partially copied files, from previous, failed attempts.
|
||||
*/
|
||||
public class CopyCommitter extends FileOutputCommitter {
|
||||
private static final Log LOG = LogFactory.getLog(CopyCommitter.class);
|
||||
|
||||
private final TaskAttemptContext taskAttemptContext;
|
||||
|
||||
/**
 * Creates an output committer and remembers the task context so that
 * progress and status can be reported during commit.
 *
 * @param outputPath the job's output path
 * @param context the task's context
 * @throws IOException if the base committer cannot be created
 */
public CopyCommitter(Path outputPath, TaskAttemptContext context) throws IOException {
  super(outputPath, context);
  taskAttemptContext = context;
}
|
||||
|
||||
/** @inheritDoc */
|
||||
@Override
|
||||
public void commitJob(JobContext jobContext) throws IOException {
|
||||
Configuration conf = jobContext.getConfiguration();
|
||||
super.commitJob(jobContext);
|
||||
|
||||
cleanupTempFiles(jobContext);
|
||||
|
||||
String attributes = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
|
||||
if (attributes != null && !attributes.isEmpty()) {
|
||||
preserveFileAttributesForDirectories(conf);
|
||||
}
|
||||
|
||||
try {
|
||||
if (conf.getBoolean(DistCpConstants.CONF_LABEL_DELETE_MISSING, false)) {
|
||||
deleteMissing(conf);
|
||||
} else if (conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false)) {
|
||||
commitData(conf);
|
||||
}
|
||||
taskAttemptContext.setStatus("Commit Successful");
|
||||
}
|
||||
finally {
|
||||
cleanup(conf);
|
||||
}
|
||||
}
|
||||
|
||||
/** @inheritDoc */
|
||||
@Override
|
||||
public void abortJob(JobContext jobContext,
|
||||
JobStatus.State state) throws IOException {
|
||||
try {
|
||||
super.abortJob(jobContext, state);
|
||||
} finally {
|
||||
cleanupTempFiles(jobContext);
|
||||
cleanup(jobContext.getConfiguration());
|
||||
}
|
||||
}
|
||||
|
||||
private void cleanupTempFiles(JobContext context) {
|
||||
try {
|
||||
Configuration conf = context.getConfiguration();
|
||||
|
||||
Path targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
|
||||
FileSystem targetFS = targetWorkPath.getFileSystem(conf);
|
||||
|
||||
String jobId = context.getJobID().toString();
|
||||
deleteAttemptTempFiles(targetWorkPath, targetFS, jobId);
|
||||
deleteAttemptTempFiles(targetWorkPath.getParent(), targetFS, jobId);
|
||||
} catch (Throwable t) {
|
||||
LOG.warn("Unable to cleanup temp files", t);
|
||||
}
|
||||
}
|
||||
|
||||
private void deleteAttemptTempFiles(Path targetWorkPath,
|
||||
FileSystem targetFS,
|
||||
String jobId) throws IOException {
|
||||
|
||||
FileStatus[] tempFiles = targetFS.globStatus(
|
||||
new Path(targetWorkPath, ".distcp.tmp." + jobId.replaceAll("job","attempt") + "*"));
|
||||
|
||||
if (tempFiles != null && tempFiles.length > 0) {
|
||||
for (FileStatus file : tempFiles) {
|
||||
LOG.info("Cleaning up " + file.getPath());
|
||||
targetFS.delete(file.getPath(), false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup meta folder and other temporary files
|
||||
*
|
||||
* @param conf - Job Configuration
|
||||
*/
|
||||
private void cleanup(Configuration conf) {
|
||||
Path metaFolder = new Path(conf.get(DistCpConstants.CONF_LABEL_META_FOLDER));
|
||||
try {
|
||||
FileSystem fs = metaFolder.getFileSystem(conf);
|
||||
LOG.info("Cleaning up temporary work folder: " + metaFolder);
|
||||
fs.delete(metaFolder, true);
|
||||
} catch (IOException ignore) {
|
||||
LOG.error("Exception encountered ", ignore);
|
||||
}
|
||||
}
|
||||
|
||||
// This method changes the target-directories' file-attributes (owner,
|
||||
// user/group permissions, etc.) based on the corresponding source directories.
|
||||
private void preserveFileAttributesForDirectories(Configuration conf) throws IOException {
|
||||
String attrSymbols = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
|
||||
LOG.info("About to preserve attributes: " + attrSymbols);
|
||||
|
||||
EnumSet<FileAttribute> attributes = DistCpUtils.unpackAttributes(attrSymbols);
|
||||
|
||||
Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
|
||||
FileSystem clusterFS = sourceListing.getFileSystem(conf);
|
||||
SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
|
||||
SequenceFile.Reader.file(sourceListing));
|
||||
long totalLen = clusterFS.getFileStatus(sourceListing).getLen();
|
||||
|
||||
Path targetRoot = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
|
||||
|
||||
long preservedEntries = 0;
|
||||
try {
|
||||
FileStatus srcFileStatus = new FileStatus();
|
||||
Text srcRelPath = new Text();
|
||||
|
||||
// Iterate over every source path that was copied.
|
||||
while (sourceReader.next(srcRelPath, srcFileStatus)) {
|
||||
// File-attributes for files are set at the time of copy,
|
||||
// in the map-task.
|
||||
if (! srcFileStatus.isDirectory()) continue;
|
||||
|
||||
Path targetFile = new Path(targetRoot.toString() + "/" + srcRelPath);
|
||||
|
||||
// Skip the root folder.
|
||||
// Status can't be preserved on root-folder. (E.g. multiple paths may
|
||||
// be copied to a single target folder. Which source-attributes to use
|
||||
// on the target is undefined.)
|
||||
if (targetRoot.equals(targetFile)) continue;
|
||||
|
||||
FileSystem targetFS = targetFile.getFileSystem(conf);
|
||||
DistCpUtils.preserve(targetFS, targetFile, srcFileStatus, attributes);
|
||||
|
||||
taskAttemptContext.progress();
|
||||
taskAttemptContext.setStatus("Preserving status on directory entries. [" +
|
||||
sourceReader.getPosition() * 100 / totalLen + "%]");
|
||||
}
|
||||
} finally {
|
||||
IOUtils.closeStream(sourceReader);
|
||||
}
|
||||
LOG.info("Preserved status on " + preservedEntries + " dir entries on target");
|
||||
}
|
||||
|
||||
// This method deletes "extra" files from the target, if they're not
|
||||
// available at the source.
|
||||
private void deleteMissing(Configuration conf) throws IOException {
|
||||
LOG.info("-delete option is enabled. About to remove entries from " +
|
||||
"target that are missing in source");
|
||||
|
||||
// Sort the source-file listing alphabetically.
|
||||
Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
|
||||
FileSystem clusterFS = sourceListing.getFileSystem(conf);
|
||||
Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
|
||||
|
||||
// Similarly, create the listing of target-files. Sort alphabetically.
|
||||
Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
|
||||
CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
|
||||
|
||||
List<Path> targets = new ArrayList<Path>(1);
|
||||
Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
|
||||
targets.add(targetFinalPath);
|
||||
DistCpOptions options = new DistCpOptions(targets, new Path("/NONE"));
|
||||
|
||||
target.buildListing(targetListing, options);
|
||||
Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
|
||||
long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
|
||||
|
||||
SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf,
|
||||
SequenceFile.Reader.file(sortedSourceListing));
|
||||
SequenceFile.Reader targetReader = new SequenceFile.Reader(conf,
|
||||
SequenceFile.Reader.file(sortedTargetListing));
|
||||
|
||||
// Walk both source and target file listings.
|
||||
// Delete all from target that doesn't also exist on source.
|
||||
long deletedEntries = 0;
|
||||
try {
|
||||
FileStatus srcFileStatus = new FileStatus();
|
||||
Text srcRelPath = new Text();
|
||||
FileStatus trgtFileStatus = new FileStatus();
|
||||
Text trgtRelPath = new Text();
|
||||
|
||||
FileSystem targetFS = targetFinalPath.getFileSystem(conf);
|
||||
boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
|
||||
while (targetReader.next(trgtRelPath, trgtFileStatus)) {
|
||||
// Skip sources that don't exist on target.
|
||||
while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
|
||||
srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
|
||||
}
|
||||
|
||||
if (srcAvailable && trgtRelPath.equals(srcRelPath)) continue;
|
||||
|
||||
// Target doesn't exist at source. Delete.
|
||||
boolean result = (!targetFS.exists(trgtFileStatus.getPath()) ||
|
||||
targetFS.delete(trgtFileStatus.getPath(), true));
|
||||
if (result) {
|
||||
LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
|
||||
deletedEntries++;
|
||||
} else {
|
||||
throw new IOException("Unable to delete " + trgtFileStatus.getPath());
|
||||
}
|
||||
taskAttemptContext.progress();
|
||||
taskAttemptContext.setStatus("Deleting missing files from target. [" +
|
||||
targetReader.getPosition() * 100 / totalLen + "%]");
|
||||
}
|
||||
} finally {
|
||||
IOUtils.closeStream(sourceReader);
|
||||
IOUtils.closeStream(targetReader);
|
||||
}
|
||||
LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
|
||||
}
|
||||
|
||||
private void commitData(Configuration conf) throws IOException {
|
||||
|
||||
Path workDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
|
||||
Path finalDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
|
||||
FileSystem targetFS = workDir.getFileSystem(conf);
|
||||
|
||||
LOG.info("Atomic commit enabled. Moving " + workDir + " to " + finalDir);
|
||||
if (targetFS.exists(finalDir) && targetFS.exists(workDir)) {
|
||||
LOG.error("Pre-existing final-path found at: " + finalDir);
|
||||
throw new IOException("Target-path can't be committed to because it " +
|
||||
"exists at " + finalDir + ". Copied data is in temp-dir: " + workDir + ". ");
|
||||
}
|
||||
|
||||
boolean result = targetFS.rename(workDir, finalDir);
|
||||
if (!result) {
|
||||
LOG.warn("Rename failed. Perhaps data already moved. Verifying...");
|
||||
result = targetFS.exists(finalDir) && !targetFS.exists(workDir);
|
||||
}
|
||||
if (result) {
|
||||
LOG.info("Data committed successfully to " + finalDir);
|
||||
taskAttemptContext.setStatus("Data committed successfully to " + finalDir);
|
||||
} else {
|
||||
LOG.error("Unable to commit data to " + finalDir);
|
||||
throw new IOException("Atomic commit failed. Temporary data in " + workDir +
|
||||
", Unable to move to " + finalDir);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,330 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools.mapred;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.JobContext;
|
||||
import org.apache.hadoop.mapreduce.Mapper;
|
||||
import org.apache.hadoop.tools.DistCpConstants;
|
||||
import org.apache.hadoop.tools.DistCpOptionSwitch;
|
||||
import org.apache.hadoop.tools.DistCpOptions;
|
||||
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Mapper class that executes the DistCp copy operation.
|
||||
* Implements the o.a.h.mapreduce.Mapper<> interface.
|
||||
*/
|
||||
public class CopyMapper extends Mapper<Text, FileStatus, Text, Text> {
|
||||
|
||||
/**
|
||||
* Hadoop counters for the DistCp CopyMapper.
|
||||
* (These have been kept identical to the old DistCp,
|
||||
* for backward compatibility.)
|
||||
*/
|
||||
public static enum Counter {
|
||||
COPY, // Number of files received by the mapper for copy.
|
||||
SKIP, // Number of files skipped.
|
||||
FAIL, // Number of files that failed to be copied.
|
||||
BYTESCOPIED, // Number of bytes actually copied by the copy-mapper, total.
|
||||
BYTESEXPECTED,// Number of bytes expected to be copied.
|
||||
BYTESFAILED, // Number of bytes that failed to be copied.
|
||||
BYTESSKIPPED, // Number of bytes that were skipped from copy.
|
||||
}
|
||||
|
||||
private static Log LOG = LogFactory.getLog(CopyMapper.class);
|
||||
|
||||
private Configuration conf;
|
||||
|
||||
private boolean syncFolders = false;
|
||||
private boolean ignoreFailures = false;
|
||||
private boolean skipCrc = false;
|
||||
private boolean overWrite = false;
|
||||
private EnumSet<FileAttribute> preserve = EnumSet.noneOf(FileAttribute.class);
|
||||
|
||||
private FileSystem targetFS = null;
|
||||
private Path targetWorkPath = null;
|
||||
|
||||
/**
|
||||
* Implementation of the Mapper::setup() method. This extracts the DistCp-
|
||||
* options specified in the Job's configuration, to set up the Job.
|
||||
* @param context Mapper's context.
|
||||
* @throws IOException On IO failure.
|
||||
* @throws InterruptedException If the job is interrupted.
|
||||
*/
|
||||
@Override
|
||||
public void setup(Context context) throws IOException, InterruptedException {
|
||||
conf = context.getConfiguration();
|
||||
|
||||
syncFolders = conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false);
|
||||
ignoreFailures = conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false);
|
||||
skipCrc = conf.getBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), false);
|
||||
overWrite = conf.getBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), false);
|
||||
preserve = DistCpUtils.unpackAttributes(conf.get(DistCpOptionSwitch.
|
||||
PRESERVE_STATUS.getConfigLabel()));
|
||||
|
||||
targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
|
||||
Path targetFinalPath = new Path(conf.get(
|
||||
DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
|
||||
targetFS = targetFinalPath.getFileSystem(conf);
|
||||
|
||||
if (targetFS.exists(targetFinalPath) && targetFS.isFile(targetFinalPath)) {
|
||||
overWrite = true; // When target is an existing file, overwrite it.
|
||||
}
|
||||
|
||||
if (conf.get(DistCpConstants.CONF_LABEL_SSL_CONF) != null) {
|
||||
initializeSSLConf(context);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize SSL Config if same is set in conf
|
||||
*
|
||||
* @throws IOException - If any
|
||||
*/
|
||||
private void initializeSSLConf(Context context) throws IOException {
|
||||
LOG.info("Initializing SSL configuration");
|
||||
|
||||
String workDir = conf.get(JobContext.JOB_LOCAL_DIR) + "/work";
|
||||
Path[] cacheFiles = context.getLocalCacheFiles();
|
||||
|
||||
Configuration sslConfig = new Configuration(false);
|
||||
String sslConfFileName = conf.get(DistCpConstants.CONF_LABEL_SSL_CONF);
|
||||
Path sslClient = findCacheFile(cacheFiles, sslConfFileName);
|
||||
if (sslClient == null) {
|
||||
LOG.warn("SSL Client config file not found. Was looking for " + sslConfFileName +
|
||||
" in " + Arrays.toString(cacheFiles));
|
||||
return;
|
||||
}
|
||||
sslConfig.addResource(sslClient);
|
||||
|
||||
String trustStoreFile = conf.get("ssl.client.truststore.location");
|
||||
Path trustStorePath = findCacheFile(cacheFiles, trustStoreFile);
|
||||
sslConfig.set("ssl.client.truststore.location", trustStorePath.toString());
|
||||
|
||||
String keyStoreFile = conf.get("ssl.client.keystore.location");
|
||||
Path keyStorePath = findCacheFile(cacheFiles, keyStoreFile);
|
||||
sslConfig.set("ssl.client.keystore.location", keyStorePath.toString());
|
||||
|
||||
try {
|
||||
OutputStream out = new FileOutputStream(workDir + "/" + sslConfFileName);
|
||||
try {
|
||||
sslConfig.writeXml(out);
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
conf.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfFileName);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Unable to write out the ssl configuration. " +
|
||||
"Will fall back to default ssl-client.xml in class path, if there is one", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Find entry from distributed cache
|
||||
*
|
||||
* @param cacheFiles - All localized cache files
|
||||
* @param fileName - fileName to search
|
||||
* @return Path of the filename if found, else null
|
||||
*/
|
||||
private Path findCacheFile(Path[] cacheFiles, String fileName) {
|
||||
if (cacheFiles != null && cacheFiles.length > 0) {
|
||||
for (Path file : cacheFiles) {
|
||||
if (file.getName().equals(fileName)) {
|
||||
return file;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of the Mapper<>::map(). Does the copy.
|
||||
* @param relPath The target path.
|
||||
* @param sourceFileStatus The source path.
|
||||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
public void map(Text relPath, FileStatus sourceFileStatus, Context context)
|
||||
throws IOException, InterruptedException {
|
||||
Path sourcePath = sourceFileStatus.getPath();
|
||||
|
||||
if (LOG.isDebugEnabled())
|
||||
LOG.debug("DistCpMapper::map(): Received " + sourcePath + ", " + relPath);
|
||||
|
||||
Path target = new Path(targetWorkPath.makeQualified(targetFS.getUri(),
|
||||
targetFS.getWorkingDirectory()) + relPath.toString());
|
||||
|
||||
EnumSet<DistCpOptions.FileAttribute> fileAttributes
|
||||
= getFileAttributeSettings(context);
|
||||
|
||||
final String description = "Copying " + sourcePath + " to " + target;
|
||||
context.setStatus(description);
|
||||
|
||||
LOG.info(description);
|
||||
|
||||
try {
|
||||
FileStatus sourceCurrStatus;
|
||||
FileSystem sourceFS;
|
||||
try {
|
||||
sourceFS = sourcePath.getFileSystem(conf);
|
||||
sourceCurrStatus = sourceFS.getFileStatus(sourcePath);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new IOException(new RetriableFileCopyCommand.CopyReadException(e));
|
||||
}
|
||||
|
||||
FileStatus targetStatus = null;
|
||||
|
||||
try {
|
||||
targetStatus = targetFS.getFileStatus(target);
|
||||
} catch (FileNotFoundException ignore) {
|
||||
if (LOG.isDebugEnabled())
|
||||
LOG.debug("Path could not be found: " + target, ignore);
|
||||
}
|
||||
|
||||
if (targetStatus != null && (targetStatus.isDirectory() != sourceCurrStatus.isDirectory())) {
|
||||
throw new IOException("Can't replace " + target + ". Target is " +
|
||||
getFileType(targetStatus) + ", Source is " + getFileType(sourceCurrStatus));
|
||||
}
|
||||
|
||||
if (sourceCurrStatus.isDirectory()) {
|
||||
createTargetDirsWithRetry(description, target, context);
|
||||
return;
|
||||
}
|
||||
|
||||
if (skipFile(sourceFS, sourceCurrStatus, target)) {
|
||||
LOG.info("Skipping copy of " + sourceCurrStatus.getPath()
|
||||
+ " to " + target);
|
||||
updateSkipCounters(context, sourceCurrStatus);
|
||||
context.write(null, new Text("SKIP: " + sourceCurrStatus.getPath()));
|
||||
}
|
||||
else {
|
||||
copyFileWithRetry(description, sourceCurrStatus, target, context,
|
||||
fileAttributes);
|
||||
}
|
||||
|
||||
DistCpUtils.preserve(target.getFileSystem(conf), target,
|
||||
sourceCurrStatus, fileAttributes);
|
||||
|
||||
} catch (IOException exception) {
|
||||
handleFailures(exception, sourceFileStatus, target, context);
|
||||
}
|
||||
}
|
||||
|
||||
private String getFileType(FileStatus fileStatus) {
|
||||
return fileStatus == null ? "N/A" : (fileStatus.isDirectory() ? "dir" : "file");
|
||||
}
|
||||
|
||||
private static EnumSet<DistCpOptions.FileAttribute>
|
||||
getFileAttributeSettings(Mapper.Context context) {
|
||||
String attributeString = context.getConfiguration().get(
|
||||
DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel());
|
||||
return DistCpUtils.unpackAttributes(attributeString);
|
||||
}
|
||||
|
||||
private void copyFileWithRetry(String description, FileStatus sourceFileStatus,
|
||||
Path target, Context context,
|
||||
EnumSet<DistCpOptions.FileAttribute> fileAttributes) throws IOException {
|
||||
|
||||
long bytesCopied;
|
||||
try {
|
||||
bytesCopied = (Long)new RetriableFileCopyCommand(description)
|
||||
.execute(sourceFileStatus, target, context, fileAttributes);
|
||||
} catch (Exception e) {
|
||||
context.setStatus("Copy Failure: " + sourceFileStatus.getPath());
|
||||
throw new IOException("File copy failed: " + sourceFileStatus.getPath() +
|
||||
" --> " + target, e);
|
||||
}
|
||||
incrementCounter(context, Counter.BYTESEXPECTED, sourceFileStatus.getLen());
|
||||
incrementCounter(context, Counter.BYTESCOPIED, bytesCopied);
|
||||
incrementCounter(context, Counter.COPY, 1);
|
||||
}
|
||||
|
||||
private void createTargetDirsWithRetry(String description,
|
||||
Path target, Context context) throws IOException {
|
||||
try {
|
||||
new RetriableDirectoryCreateCommand(description).execute(target, context);
|
||||
} catch (Exception e) {
|
||||
throw new IOException("mkdir failed for " + target, e);
|
||||
}
|
||||
incrementCounter(context, Counter.COPY, 1);
|
||||
}
|
||||
|
||||
private static void updateSkipCounters(Context context,
|
||||
FileStatus sourceFile) {
|
||||
incrementCounter(context, Counter.SKIP, 1);
|
||||
incrementCounter(context, Counter.BYTESSKIPPED, sourceFile.getLen());
|
||||
|
||||
}
|
||||
|
||||
private void handleFailures(IOException exception,
|
||||
FileStatus sourceFileStatus, Path target,
|
||||
Context context) throws IOException, InterruptedException {
|
||||
LOG.error("Failure in copying " + sourceFileStatus.getPath() + " to " +
|
||||
target, exception);
|
||||
|
||||
if (ignoreFailures && exception.getCause() instanceof
|
||||
RetriableFileCopyCommand.CopyReadException) {
|
||||
incrementCounter(context, Counter.FAIL, 1);
|
||||
incrementCounter(context, Counter.BYTESFAILED, sourceFileStatus.getLen());
|
||||
context.write(null, new Text("FAIL: " + sourceFileStatus.getPath() + " - " +
|
||||
StringUtils.stringifyException(exception)));
|
||||
}
|
||||
else
|
||||
throw exception;
|
||||
}
|
||||
|
||||
/**
 * Adds a value to one of the CopyMapper counters.
 * @param context Mapper's context, which owns the counters.
 * @param counter The counter to update.
 * @param value Amount to add.
 */
private static void incrementCounter(Context context, Counter counter,
                                     long value) {
  context.getCounter(counter).increment(value);
}
|
||||
|
||||
private boolean skipFile(FileSystem sourceFS, FileStatus source, Path target)
|
||||
throws IOException {
|
||||
return targetFS.exists(target)
|
||||
&& !overWrite
|
||||
&& !mustUpdate(sourceFS, source, target);
|
||||
}
|
||||
|
||||
private boolean mustUpdate(FileSystem sourceFS, FileStatus source, Path target)
|
||||
throws IOException {
|
||||
final FileStatus targetFileStatus = targetFS.getFileStatus(target);
|
||||
|
||||
return syncFolders
|
||||
&& (
|
||||
targetFileStatus.getLen() != source.getLen()
|
||||
|| (!skipCrc &&
|
||||
!DistCpUtils.checksumsAreEqual(sourceFS,
|
||||
source.getPath(), targetFS, target))
|
||||
|| (source.getBlockSize() != targetFileStatus.getBlockSize() &&
|
||||
preserve.contains(FileAttribute.BLOCKSIZE))
|
||||
);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools.mapred;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.mapreduce.*;
|
||||
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
|
||||
import org.apache.hadoop.mapreduce.security.TokenCache;
|
||||
import org.apache.hadoop.tools.DistCpConstants;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* The CopyOutputFormat is the Hadoop OutputFormat used in DistCp.
|
||||
* It sets up the Job's Configuration (in the Job-Context) with the settings
|
||||
* for the work-directory, final commit-directory, etc. It also sets the right
|
||||
* output-committer.
|
||||
* @param <K>
|
||||
* @param <V>
|
||||
*/
|
||||
public class CopyOutputFormat<K, V> extends TextOutputFormat<K, V> {
|
||||
|
||||
/**
|
||||
* Setter for the working directory for DistCp (where files will be copied
|
||||
* before they are moved to the final commit-directory.)
|
||||
* @param job The Job on whose configuration the working-directory is to be set.
|
||||
* @param workingDirectory The path to use as the working directory.
|
||||
*/
|
||||
public static void setWorkingDirectory(Job job, Path workingDirectory) {
|
||||
job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH,
|
||||
workingDirectory.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter for the final directory for DistCp (where files copied will be
|
||||
* moved, atomically.)
|
||||
* @param job The Job on whose configuration the working-directory is to be set.
|
||||
* @param commitDirectory The path to use for final commit.
|
||||
*/
|
||||
public static void setCommitDirectory(Job job, Path commitDirectory) {
|
||||
job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH,
|
||||
commitDirectory.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for the working directory.
|
||||
* @param job The Job from whose configuration the working-directory is to
|
||||
* be retrieved.
|
||||
* @return The working-directory Path.
|
||||
*/
|
||||
public static Path getWorkingDirectory(Job job) {
|
||||
return getWorkingDirectory(job.getConfiguration());
|
||||
}
|
||||
|
||||
private static Path getWorkingDirectory(Configuration conf) {
|
||||
String workingDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH);
|
||||
if (workingDirectory == null || workingDirectory.isEmpty()) {
|
||||
return null;
|
||||
} else {
|
||||
return new Path(workingDirectory);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for the final commit-directory.
|
||||
* @param job The Job from whose configuration the commit-directory is to be
|
||||
* retrieved.
|
||||
* @return The commit-directory Path.
|
||||
*/
|
||||
public static Path getCommitDirectory(Job job) {
|
||||
return getCommitDirectory(job.getConfiguration());
|
||||
}
|
||||
|
||||
private static Path getCommitDirectory(Configuration conf) {
|
||||
String commitDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH);
|
||||
if (commitDirectory == null || commitDirectory.isEmpty()) {
|
||||
return null;
|
||||
} else {
|
||||
return new Path(commitDirectory);
|
||||
}
|
||||
}
|
||||
|
||||
/** @inheritDoc */
|
||||
@Override
|
||||
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
|
||||
return new CopyCommitter(getOutputPath(context), context);
|
||||
}
|
||||
|
||||
/** @inheritDoc */
|
||||
@Override
|
||||
public void checkOutputSpecs(JobContext context) throws IOException {
|
||||
Configuration conf = context.getConfiguration();
|
||||
|
||||
if (getCommitDirectory(conf) == null) {
|
||||
throw new IllegalStateException("Commit directory not configured");
|
||||
}
|
||||
|
||||
Path workingPath = getWorkingDirectory(conf);
|
||||
if (workingPath == null) {
|
||||
throw new IllegalStateException("Working directory not configured");
|
||||
}
|
||||
|
||||
// get delegation token for outDir's file system
|
||||
TokenCache.obtainTokensForNamenodes(context.getCredentials(),
|
||||
new Path[] {workingPath}, conf);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools.mapred;
|
||||
|
||||
import org.apache.hadoop.tools.util.RetriableCommand;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.mapreduce.Mapper;
|
||||
|
||||
/**
|
||||
* This class extends Retriable command to implement the creation of directories
|
||||
* with retries on failure.
|
||||
*/
|
||||
public class RetriableDirectoryCreateCommand extends RetriableCommand {
|
||||
|
||||
/**
|
||||
* Constructor, taking a description of the action.
|
||||
* @param description Verbose description of the copy operation.
|
||||
*/
|
||||
public RetriableDirectoryCreateCommand(String description) {
|
||||
super(description);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of RetriableCommand::doExecute().
|
||||
* This implements the actual mkdirs() functionality.
|
||||
* @param arguments Argument-list to the command.
|
||||
* @return Boolean. True, if the directory could be created successfully.
|
||||
* @throws Exception IOException, on failure to create the directory.
|
||||
*/
|
||||
@Override
|
||||
protected Object doExecute(Object... arguments) throws Exception {
|
||||
assert arguments.length == 2 : "Unexpected argument list.";
|
||||
Path target = (Path)arguments[0];
|
||||
Mapper.Context context = (Mapper.Context)arguments[1];
|
||||
|
||||
FileSystem targetFS = target.getFileSystem(context.getConfiguration());
|
||||
return targetFS.mkdirs(target);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,245 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools.mapred;
|
||||
|
||||
import org.apache.hadoop.tools.util.RetriableCommand;
|
||||
import org.apache.hadoop.tools.util.ThrottledInputStream;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.tools.DistCpOptions.*;
|
||||
import org.apache.hadoop.tools.DistCpConstants;
|
||||
import org.apache.hadoop.fs.*;
|
||||
import org.apache.hadoop.mapreduce.Mapper;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.EnumSet;
|
||||
|
||||
/**
|
||||
* This class extends RetriableCommand to implement the copy of files,
|
||||
* with retries on failure.
|
||||
*/
|
||||
public class RetriableFileCopyCommand extends RetriableCommand {
|
||||
|
||||
// Class-wide logger; never reassigned, hence final.
private static final Log LOG = LogFactory.getLog(RetriableFileCopyCommand.class);
// Size, in bytes, of the copy buffer and of the target stream's buffer.
private static final int BUFFER_SIZE = 8 * 1024;

/**
 * Constructor, taking a description of the action.
 * @param description Verbose description of the copy operation.
 */
public RetriableFileCopyCommand(String description) {
  super(description);
}
|
||||
|
||||
/**
|
||||
* Implementation of RetriableCommand::doExecute().
|
||||
* This is the actual copy-implementation.
|
||||
* @param arguments Argument-list to the command.
|
||||
* @return Number of bytes copied.
|
||||
* @throws Exception: CopyReadException, if there are read-failures. All other
|
||||
* failures are IOExceptions.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
protected Object doExecute(Object... arguments) throws Exception {
|
||||
assert arguments.length == 4 : "Unexpected argument list.";
|
||||
FileStatus source = (FileStatus)arguments[0];
|
||||
assert !source.isDirectory() : "Unexpected file-status. Expected file.";
|
||||
Path target = (Path)arguments[1];
|
||||
Mapper.Context context = (Mapper.Context)arguments[2];
|
||||
EnumSet<FileAttribute> fileAttributes
|
||||
= (EnumSet<FileAttribute>)arguments[3];
|
||||
return doCopy(source, target, context, fileAttributes);
|
||||
}
|
||||
|
||||
private long doCopy(FileStatus sourceFileStatus, Path target,
|
||||
Mapper.Context context,
|
||||
EnumSet<FileAttribute> fileAttributes)
|
||||
throws IOException {
|
||||
|
||||
Path tmpTargetPath = getTmpFile(target, context);
|
||||
final Configuration configuration = context.getConfiguration();
|
||||
FileSystem targetFS = target.getFileSystem(configuration);
|
||||
|
||||
try {
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Copying " + sourceFileStatus.getPath() + " to " + target);
|
||||
LOG.debug("Tmp-file path: " + tmpTargetPath);
|
||||
}
|
||||
FileSystem sourceFS = sourceFileStatus.getPath().getFileSystem(
|
||||
configuration);
|
||||
long bytesRead = copyToTmpFile(tmpTargetPath, targetFS, sourceFileStatus,
|
||||
context, fileAttributes);
|
||||
|
||||
compareFileLengths(sourceFileStatus, tmpTargetPath, configuration, bytesRead);
|
||||
compareCheckSums(sourceFS, sourceFileStatus.getPath(), targetFS, tmpTargetPath);
|
||||
promoteTmpToTarget(tmpTargetPath, target, targetFS);
|
||||
return bytesRead;
|
||||
|
||||
} finally {
|
||||
if (targetFS.exists(tmpTargetPath))
|
||||
targetFS.delete(tmpTargetPath, false);
|
||||
}
|
||||
}
|
||||
|
||||
private long copyToTmpFile(Path tmpTargetPath, FileSystem targetFS,
|
||||
FileStatus sourceFileStatus, Mapper.Context context,
|
||||
EnumSet<FileAttribute> fileAttributes)
|
||||
throws IOException {
|
||||
OutputStream outStream = new BufferedOutputStream(targetFS.create(
|
||||
tmpTargetPath, true, BUFFER_SIZE,
|
||||
getReplicationFactor(fileAttributes, sourceFileStatus, targetFS),
|
||||
getBlockSize(fileAttributes, sourceFileStatus, targetFS), context));
|
||||
return copyBytes(sourceFileStatus, outStream, BUFFER_SIZE, true, context);
|
||||
}
|
||||
|
||||
private void compareFileLengths(FileStatus sourceFileStatus, Path target,
|
||||
Configuration configuration, long bytesRead)
|
||||
throws IOException {
|
||||
final Path sourcePath = sourceFileStatus.getPath();
|
||||
FileSystem fs = sourcePath.getFileSystem(configuration);
|
||||
if (fs.getFileStatus(sourcePath).getLen() != bytesRead)
|
||||
throw new IOException("Mismatch in length of source:" + sourcePath
|
||||
+ " and target:" + target);
|
||||
}
|
||||
|
||||
private void compareCheckSums(FileSystem sourceFS, Path source,
|
||||
FileSystem targetFS, Path target)
|
||||
throws IOException {
|
||||
if (!DistCpUtils.checksumsAreEqual(sourceFS, source, targetFS, target))
|
||||
throw new IOException("Check-sum mismatch between "
|
||||
+ source + " and " + target);
|
||||
|
||||
}
|
||||
|
||||
//If target file exists and unable to delete target - fail
|
||||
//If target doesn't exist and unable to create parent folder - fail
|
||||
//If target is successfully deleted and parent exists, if rename fails - fail
|
||||
private void promoteTmpToTarget(Path tmpTarget, Path target, FileSystem fs)
|
||||
throws IOException {
|
||||
if ((fs.exists(target) && !fs.delete(target, false))
|
||||
|| (!fs.exists(target.getParent()) && !fs.mkdirs(target.getParent()))
|
||||
|| !fs.rename(tmpTarget, target)) {
|
||||
throw new IOException("Failed to promote tmp-file:" + tmpTarget
|
||||
+ " to: " + target);
|
||||
}
|
||||
}
|
||||
|
||||
private Path getTmpFile(Path target, Mapper.Context context) {
|
||||
Path targetWorkPath = new Path(context.getConfiguration().
|
||||
get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
|
||||
|
||||
Path root = target.equals(targetWorkPath)? targetWorkPath.getParent() : targetWorkPath;
|
||||
LOG.info("Creating temp file: " +
|
||||
new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString()));
|
||||
return new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString());
|
||||
}
|
||||
|
||||
private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream,
|
||||
int bufferSize, boolean mustCloseStream,
|
||||
Mapper.Context context) throws IOException {
|
||||
Path source = sourceFileStatus.getPath();
|
||||
byte buf[] = new byte[bufferSize];
|
||||
ThrottledInputStream inStream = null;
|
||||
long totalBytesRead = 0;
|
||||
|
||||
try {
|
||||
inStream = getInputStream(source, context.getConfiguration());
|
||||
int bytesRead = readBytes(inStream, buf);
|
||||
while (bytesRead >= 0) {
|
||||
totalBytesRead += bytesRead;
|
||||
outStream.write(buf, 0, bytesRead);
|
||||
updateContextStatus(totalBytesRead, context, sourceFileStatus);
|
||||
bytesRead = inStream.read(buf);
|
||||
}
|
||||
} finally {
|
||||
if (mustCloseStream)
|
||||
IOUtils.cleanup(LOG, outStream, inStream);
|
||||
}
|
||||
|
||||
return totalBytesRead;
|
||||
}
|
||||
|
||||
private void updateContextStatus(long totalBytesRead, Mapper.Context context,
|
||||
FileStatus sourceFileStatus) {
|
||||
StringBuilder message = new StringBuilder(DistCpUtils.getFormatter()
|
||||
.format(totalBytesRead * 100.0f / sourceFileStatus.getLen()));
|
||||
message.append("% ")
|
||||
.append(description).append(" [")
|
||||
.append(DistCpUtils.getStringDescriptionFor(totalBytesRead))
|
||||
.append('/')
|
||||
.append(DistCpUtils.getStringDescriptionFor(sourceFileStatus.getLen()))
|
||||
.append(']');
|
||||
context.setStatus(message.toString());
|
||||
}
|
||||
|
||||
private static int readBytes(InputStream inStream, byte buf[])
|
||||
throws IOException {
|
||||
try {
|
||||
return inStream.read(buf);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new CopyReadException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static ThrottledInputStream getInputStream(Path path, Configuration conf)
|
||||
throws IOException {
|
||||
try {
|
||||
FileSystem fs = path.getFileSystem(conf);
|
||||
long bandwidthMB = conf.getInt(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
|
||||
DistCpConstants.DEFAULT_BANDWIDTH_MB);
|
||||
return new ThrottledInputStream(new BufferedInputStream(fs.open(path)),
|
||||
bandwidthMB * 1024 * 1024);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new CopyReadException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static short getReplicationFactor(
|
||||
EnumSet<FileAttribute> fileAttributes,
|
||||
FileStatus sourceFile, FileSystem targetFS) {
|
||||
return fileAttributes.contains(FileAttribute.REPLICATION)?
|
||||
sourceFile.getReplication() : targetFS.getDefaultReplication();
|
||||
}
|
||||
|
||||
private static long getBlockSize(
|
||||
EnumSet<FileAttribute> fileAttributes,
|
||||
FileStatus sourceFile, FileSystem targetFS) {
|
||||
return fileAttributes.contains(FileAttribute.BLOCKSIZE)?
|
||||
sourceFile.getBlockSize() : targetFS.getDefaultBlockSize();
|
||||
}
|
||||
|
||||
/**
 * Special subclass of IOException. This is used to distinguish read-operation
 * failures from other kinds of IOExceptions.
 * The failure to read from source is dealt with specially, in the CopyMapper.
 * Such failures may be skipped if the DistCpOptions indicate so.
 * Write failures are intolerable, and amount to CopyMapper failure.
 */
public static class CopyReadException extends IOException {
  /**
   * Wraps the underlying failure.
   * @param rootCause The exception that caused the read to fail.
   */
  public CopyReadException(Throwable rootCause) {
    super(rootCause);
  }
}
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools.mapred;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.tools.DistCpConstants;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.mapreduce.*;
|
||||
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
|
||||
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* UniformSizeInputFormat extends the InputFormat<> class, to produce
|
||||
* input-splits for DistCp.
|
||||
* It looks at the copy-listing and groups the contents into input-splits such
|
||||
* that the total-number of bytes to be copied for each input split is
|
||||
* uniform.
|
||||
*/
|
||||
public class UniformSizeInputFormat extends InputFormat<Text, FileStatus> {
|
||||
private static final Log LOG
|
||||
= LogFactory.getLog(UniformSizeInputFormat.class);
|
||||
|
||||
/**
|
||||
* Implementation of InputFormat::getSplits(). Returns a list of InputSplits,
|
||||
* such that the number of bytes to be copied for all the splits are
|
||||
* approximately equal.
|
||||
* @param context JobContext for the job.
|
||||
* @return The list of uniformly-distributed input-splits.
|
||||
* @throws IOException: On failure.
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
@Override
|
||||
public List<InputSplit> getSplits(JobContext context)
|
||||
throws IOException, InterruptedException {
|
||||
Configuration configuration = context.getConfiguration();
|
||||
int numSplits = DistCpUtils.getInt(configuration,
|
||||
JobContext.NUM_MAPS);
|
||||
|
||||
if (numSplits == 0) return new ArrayList<InputSplit>();
|
||||
|
||||
return getSplits(configuration, numSplits,
|
||||
DistCpUtils.getLong(configuration,
|
||||
DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED));
|
||||
}
|
||||
|
||||
private List<InputSplit> getSplits(Configuration configuration, int numSplits,
|
||||
long totalSizeBytes) throws IOException {
|
||||
List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
|
||||
long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
|
||||
|
||||
FileStatus srcFileStatus = new FileStatus();
|
||||
Text srcRelPath = new Text();
|
||||
long currentSplitSize = 0;
|
||||
long lastSplitStart = 0;
|
||||
long lastPosition = 0;
|
||||
|
||||
final Path listingFilePath = getListingFilePath(configuration);
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Average bytes per map: " + nBytesPerSplit +
|
||||
", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
|
||||
}
|
||||
SequenceFile.Reader reader=null;
|
||||
try {
|
||||
reader = getListingFileReader(configuration);
|
||||
while (reader.next(srcRelPath, srcFileStatus)) {
|
||||
// If adding the current file would cause the bytes per map to exceed
|
||||
// limit. Add the current file to new split
|
||||
if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
|
||||
FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
|
||||
lastPosition - lastSplitStart, null);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
|
||||
}
|
||||
splits.add(split);
|
||||
lastSplitStart = lastPosition;
|
||||
currentSplitSize = 0;
|
||||
}
|
||||
currentSplitSize += srcFileStatus.getLen();
|
||||
lastPosition = reader.getPosition();
|
||||
}
|
||||
if (lastPosition > lastSplitStart) {
|
||||
FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
|
||||
lastPosition - lastSplitStart, null);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.info ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
|
||||
}
|
||||
splits.add(split);
|
||||
}
|
||||
|
||||
} finally {
|
||||
IOUtils.closeStream(reader);
|
||||
}
|
||||
|
||||
return splits;
|
||||
}
|
||||
|
||||
private static Path getListingFilePath(Configuration configuration) {
|
||||
final String listingFilePathString =
|
||||
configuration.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
|
||||
|
||||
assert !listingFilePathString.equals("")
|
||||
: "Couldn't find listing file. Invalid input.";
|
||||
return new Path(listingFilePathString);
|
||||
}
|
||||
|
||||
private SequenceFile.Reader getListingFileReader(Configuration configuration) {
|
||||
|
||||
final Path listingFilePath = getListingFilePath(configuration);
|
||||
try {
|
||||
final FileSystem fileSystem = listingFilePath.getFileSystem(configuration);
|
||||
if (!fileSystem.exists(listingFilePath))
|
||||
throw new IllegalArgumentException("Listing file doesn't exist at: "
|
||||
+ listingFilePath);
|
||||
|
||||
return new SequenceFile.Reader(configuration,
|
||||
SequenceFile.Reader.file(listingFilePath));
|
||||
}
|
||||
catch (IOException exception) {
|
||||
LOG.error("Couldn't find listing file at: " + listingFilePath, exception);
|
||||
throw new IllegalArgumentException("Couldn't find listing-file at: "
|
||||
+ listingFilePath, exception);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of InputFormat::createRecordReader().
|
||||
* @param split The split for which the RecordReader is sought.
|
||||
* @param context The context of the current task-attempt.
|
||||
* @return A SequenceFileRecordReader instance, (since the copy-listing is a
|
||||
* simple sequence-file.)
|
||||
* @throws IOException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
@Override
|
||||
public RecordReader<Text, FileStatus> createRecordReader(InputSplit split,
|
||||
TaskAttemptContext context)
|
||||
throws IOException, InterruptedException {
|
||||
return new SequenceFileRecordReader<Text, FileStatus>();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,246 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.tools.mapred.lib;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.tools.DistCpConstants;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
|
||||
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
|
||||
import org.apache.hadoop.mapreduce.TaskAttemptContext;
|
||||
import org.apache.hadoop.mapreduce.TaskID;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* The DynamicInputChunk represents a single chunk of work, when used in
|
||||
* conjunction with the DynamicInputFormat and the DynamicRecordReader.
|
||||
* The records in the DynamicInputFormat's input-file are split across various
|
||||
* DynamicInputChunks. Each one is claimed and processed in an iteration of
|
||||
* a dynamic-mapper. When a DynamicInputChunk has been exhausted, the faster
|
||||
* mapper may claim another and process it, until there are no more to be
|
||||
* consumed.
|
||||
*/
|
||||
class DynamicInputChunk<K, V> {
|
||||
private static Log LOG = LogFactory.getLog(DynamicInputChunk.class);
|
||||
|
||||
private static Configuration configuration;
|
||||
private static Path chunkRootPath;
|
||||
private static String chunkFilePrefix;
|
||||
private static int numChunksLeft = -1; // Un-initialized before 1st dir-scan.
|
||||
private static FileSystem fs;
|
||||
|
||||
private Path chunkFilePath;
|
||||
private SequenceFileRecordReader<K, V> reader;
|
||||
private SequenceFile.Writer writer;
|
||||
|
||||
private static void initializeChunkInvariants(Configuration config)
|
||||
throws IOException {
|
||||
configuration = config;
|
||||
Path listingFilePath = new Path(getListingFilePath(configuration));
|
||||
chunkRootPath = new Path(listingFilePath.getParent(), "chunkDir");
|
||||
fs = chunkRootPath.getFileSystem(configuration);
|
||||
chunkFilePrefix = listingFilePath.getName() + ".chunk.";
|
||||
}
|
||||
|
||||
private static String getListingFilePath(Configuration configuration) {
|
||||
final String listingFileString = configuration.get(
|
||||
DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
|
||||
assert !listingFileString.equals("") : "Listing file not found.";
|
||||
return listingFileString;
|
||||
}
|
||||
|
||||
private static boolean areInvariantsInitialized() {
|
||||
return chunkRootPath != null;
|
||||
}
|
||||
|
||||
private DynamicInputChunk(String chunkId, Configuration configuration)
|
||||
throws IOException {
|
||||
if (!areInvariantsInitialized())
|
||||
initializeChunkInvariants(configuration);
|
||||
|
||||
chunkFilePath = new Path(chunkRootPath, chunkFilePrefix + chunkId);
|
||||
openForWrite();
|
||||
}
|
||||
|
||||
|
||||
private void openForWrite() throws IOException {
|
||||
writer = SequenceFile.createWriter(
|
||||
chunkFilePath.getFileSystem(configuration), configuration,
|
||||
chunkFilePath, Text.class, FileStatus.class,
|
||||
SequenceFile.CompressionType.NONE);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method to create chunk-files for writing to.
|
||||
* (For instance, when the DynamicInputFormat splits the input-file into
|
||||
* chunks.)
|
||||
* @param chunkId String to identify the chunk.
|
||||
* @param configuration Configuration, describing the location of the listing-
|
||||
* file, file-system for the map-job, etc.
|
||||
* @return A DynamicInputChunk, corresponding to a chunk-file, with the name
|
||||
* incorporating the chunk-id.
|
||||
* @throws IOException Exception on failure to create the chunk.
|
||||
*/
|
||||
public static DynamicInputChunk createChunkForWrite(String chunkId,
|
||||
Configuration configuration) throws IOException {
|
||||
return new DynamicInputChunk(chunkId, configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Method to write records into a chunk.
|
||||
* @param key Key from the listing file.
|
||||
* @param value Corresponding value from the listing file.
|
||||
* @throws IOException Exception onf failure to write to the file.
|
||||
*/
|
||||
public void write(Text key, FileStatus value) throws IOException {
|
||||
writer.append(key, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes streams opened to the chunk-file.
|
||||
*/
|
||||
public void close() {
|
||||
IOUtils.cleanup(LOG, reader, writer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reassigns the chunk to a specified Map-Task, for consumption.
|
||||
* @param taskId The Map-Task to which a the chunk is to be reassigned.
|
||||
* @throws IOException Exception on failure to reassign.
|
||||
*/
|
||||
public void assignTo(TaskID taskId) throws IOException {
|
||||
Path newPath = new Path(chunkRootPath, taskId.toString());
|
||||
if (!fs.rename(chunkFilePath, newPath)) {
|
||||
LOG.warn(chunkFilePath + " could not be assigned to " + taskId);
|
||||
}
|
||||
}
|
||||
|
||||
private DynamicInputChunk(Path chunkFilePath,
|
||||
TaskAttemptContext taskAttemptContext)
|
||||
throws IOException, InterruptedException {
|
||||
if (!areInvariantsInitialized())
|
||||
initializeChunkInvariants(taskAttemptContext.getConfiguration());
|
||||
|
||||
this.chunkFilePath = chunkFilePath;
|
||||
openForRead(taskAttemptContext);
|
||||
}
|
||||
|
||||
private void openForRead(TaskAttemptContext taskAttemptContext)
|
||||
throws IOException, InterruptedException {
|
||||
reader = new SequenceFileRecordReader<K, V>();
|
||||
reader.initialize(new FileSplit(chunkFilePath, 0,
|
||||
DistCpUtils.getFileSize(chunkFilePath, configuration), null),
|
||||
taskAttemptContext);
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method that
|
||||
* 1. acquires a chunk for the specified map-task attempt
|
||||
* 2. returns a DynamicInputChunk associated with the acquired chunk-file.
|
||||
* @param taskAttemptContext The attempt-context for the map task that's
|
||||
* trying to acquire a chunk.
|
||||
* @return The acquired dynamic-chunk. The chunk-file is renamed to the
|
||||
* attempt-id (from the attempt-context.)
|
||||
* @throws IOException Exception on failure.
|
||||
* @throws InterruptedException Exception on failure.
|
||||
*/
|
||||
public static DynamicInputChunk acquire(TaskAttemptContext taskAttemptContext)
|
||||
throws IOException, InterruptedException {
|
||||
if (!areInvariantsInitialized())
|
||||
initializeChunkInvariants(taskAttemptContext.getConfiguration());
|
||||
|
||||
String taskId
|
||||
= taskAttemptContext.getTaskAttemptID().getTaskID().toString();
|
||||
Path acquiredFilePath = new Path(chunkRootPath, taskId);
|
||||
|
||||
if (fs.exists(acquiredFilePath)) {
|
||||
LOG.info("Acquiring pre-assigned chunk: " + acquiredFilePath);
|
||||
return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
|
||||
}
|
||||
|
||||
for (FileStatus chunkFile : getListOfChunkFiles()) {
|
||||
if (fs.rename(chunkFile.getPath(), acquiredFilePath)) {
|
||||
LOG.info(taskId + " acquired " + chunkFile.getPath());
|
||||
return new DynamicInputChunk(acquiredFilePath, taskAttemptContext);
|
||||
}
|
||||
else
|
||||
LOG.warn(taskId + " could not acquire " + chunkFile.getPath());
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Method to be called to relinquish an acquired chunk. All streams open to
|
||||
* the chunk are closed, and the chunk-file is deleted.
|
||||
* @throws IOException Exception thrown on failure to release (i.e. delete)
|
||||
* the chunk file.
|
||||
*/
|
||||
public void release() throws IOException {
|
||||
close();
|
||||
if (!fs.delete(chunkFilePath, false)) {
|
||||
LOG.error("Unable to release chunk at path: " + chunkFilePath);
|
||||
throw new IOException("Unable to release chunk at path: " + chunkFilePath);
|
||||
}
|
||||
}
|
||||
|
||||
static FileStatus [] getListOfChunkFiles() throws IOException {
|
||||
Path chunkFilePattern = new Path(chunkRootPath, chunkFilePrefix + "*");
|
||||
FileStatus chunkFiles[] = fs.globStatus(chunkFilePattern);
|
||||
numChunksLeft = chunkFiles.length;
|
||||
return chunkFiles;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for the chunk-file's path, on HDFS.
|
||||
* @return The qualified path to the chunk-file.
|
||||
*/
|
||||
public Path getPath() {
|
||||
return chunkFilePath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for the record-reader, opened to the chunk-file.
|
||||
* @return Opened Sequence-file reader.
|
||||
*/
|
||||
public SequenceFileRecordReader<K,V> getReader() {
|
||||
assert reader != null : "Reader un-initialized!";
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for the number of chunk-files left in the chunk-file directory.
|
||||
* Useful to determine how many chunks (and hence, records) are left to be
|
||||
* processed.
|
||||
* @return Before the first scan of the directory, the number returned is -1.
|
||||
* Otherwise, the number of chunk-files seen from the last scan is returned.
|
||||
*/
|
||||
public static int getNumChunksLeft() {
|
||||
return numChunksLeft;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,292 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.tools.mapred.lib;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.mapreduce.*;
|
||||
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
|
||||
import org.apache.hadoop.tools.DistCpConstants;
|
||||
import org.apache.hadoop.tools.util.DistCpUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* DynamicInputFormat implements the "Worker pattern" for DistCp.
|
||||
 * Rather than splitting up the copy-list into a set of static splits,
|
||||
* the DynamicInputFormat does the following:
|
||||
* 1. Splits the copy-list into small chunks on the DFS.
|
||||
* 2. Creates a set of empty "dynamic" splits, that each consume as many chunks
|
||||
* as it can.
|
||||
* This arrangement ensures that a single slow mapper won't slow down the entire
|
||||
* job (since the slack will be picked up by other mappers, who consume more
|
||||
* chunks.)
|
||||
* By varying the split-ratio, one can vary chunk sizes to achieve different
|
||||
* performance characteristics.
|
||||
*/
|
||||
public class DynamicInputFormat<K, V> extends InputFormat<K, V> {
|
||||
private static final Log LOG = LogFactory.getLog(DynamicInputFormat.class);
|
||||
|
||||
private static final String CONF_LABEL_LISTING_SPLIT_RATIO
|
||||
= "mapred.listing.split.ratio";
|
||||
private static final String CONF_LABEL_NUM_SPLITS
|
||||
= "mapred.num.splits";
|
||||
private static final String CONF_LABEL_NUM_ENTRIES_PER_CHUNK
|
||||
= "mapred.num.entries.per.chunk";
|
||||
|
||||
/**
|
||||
* Implementation of InputFormat::getSplits(). This method splits up the
|
||||
* copy-listing file into chunks, and assigns the first batch to different
|
||||
* tasks.
|
||||
* @param jobContext JobContext for the map job.
|
||||
* @return The list of (empty) dynamic input-splits.
|
||||
* @throws IOException, on failure.
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
@Override
|
||||
public List<InputSplit> getSplits(JobContext jobContext)
|
||||
throws IOException, InterruptedException {
|
||||
LOG.info("DynamicInputFormat: Getting splits for job:"
|
||||
+ jobContext.getJobID());
|
||||
return createSplits(jobContext,
|
||||
splitCopyListingIntoChunksWithShuffle(jobContext));
|
||||
}
|
||||
|
||||
private List<InputSplit> createSplits(JobContext jobContext,
|
||||
List<DynamicInputChunk> chunks)
|
||||
throws IOException {
|
||||
int numMaps = getNumMapTasks(jobContext.getConfiguration());
|
||||
|
||||
final int nSplits = Math.min(numMaps, chunks.size());
|
||||
List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);
|
||||
|
||||
for (int i=0; i< nSplits; ++i) {
|
||||
TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
|
||||
chunks.get(i).assignTo(taskId);
|
||||
splits.add(new FileSplit(chunks.get(i).getPath(), 0,
|
||||
// Setting non-zero length for FileSplit size, to avoid a possible
|
||||
// future when 0-sized file-splits are considered "empty" and skipped
|
||||
// over.
|
||||
MIN_RECORDS_PER_CHUNK,
|
||||
null));
|
||||
}
|
||||
DistCpUtils.publish(jobContext.getConfiguration(),
|
||||
CONF_LABEL_NUM_SPLITS, splits.size());
|
||||
return splits;
|
||||
}
|
||||
|
||||
private static int N_CHUNKS_OPEN_AT_ONCE_DEFAULT = 16;
|
||||
|
||||
private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle
|
||||
(JobContext context) throws IOException {
|
||||
|
||||
final Configuration configuration = context.getConfiguration();
|
||||
int numRecords = getNumberOfRecords(configuration);
|
||||
int numMaps = getNumMapTasks(configuration);
|
||||
// Number of chunks each map will process, on average.
|
||||
int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
|
||||
validateNumChunksUsing(splitRatio, numMaps);
|
||||
|
||||
int numEntriesPerChunk = (int)Math.ceil((float)numRecords
|
||||
/(splitRatio * numMaps));
|
||||
DistCpUtils.publish(context.getConfiguration(),
|
||||
CONF_LABEL_NUM_ENTRIES_PER_CHUNK,
|
||||
numEntriesPerChunk);
|
||||
|
||||
final int nChunksTotal = (int)Math.ceil((float)numRecords/numEntriesPerChunk);
|
||||
int nChunksOpenAtOnce
|
||||
= Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);
|
||||
|
||||
Path listingPath = getListingFilePath(configuration);
|
||||
SequenceFile.Reader reader
|
||||
= new SequenceFile.Reader(configuration,
|
||||
SequenceFile.Reader.file(listingPath));
|
||||
|
||||
List<DynamicInputChunk> openChunks
|
||||
= new ArrayList<DynamicInputChunk>();
|
||||
|
||||
List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
|
||||
|
||||
FileStatus fileStatus = new FileStatus();
|
||||
Text relPath = new Text();
|
||||
int recordCounter = 0;
|
||||
int chunkCount = 0;
|
||||
|
||||
try {
|
||||
|
||||
while (reader.next(relPath, fileStatus)) {
|
||||
if (recordCounter % (nChunksOpenAtOnce*numEntriesPerChunk) == 0) {
|
||||
// All chunks full. Create new chunk-set.
|
||||
closeAll(openChunks);
|
||||
chunksFinal.addAll(openChunks);
|
||||
|
||||
openChunks = createChunks(
|
||||
configuration, chunkCount, nChunksTotal, nChunksOpenAtOnce);
|
||||
|
||||
chunkCount += openChunks.size();
|
||||
|
||||
nChunksOpenAtOnce = openChunks.size();
|
||||
recordCounter = 0;
|
||||
}
|
||||
|
||||
// Shuffle into open chunks.
|
||||
openChunks.get(recordCounter%nChunksOpenAtOnce).write(relPath, fileStatus);
|
||||
++recordCounter;
|
||||
}
|
||||
|
||||
} finally {
|
||||
closeAll(openChunks);
|
||||
chunksFinal.addAll(openChunks);
|
||||
IOUtils.closeStream(reader);
|
||||
}
|
||||
|
||||
LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
|
||||
return chunksFinal;
|
||||
}
|
||||
|
||||
private static void validateNumChunksUsing(int splitRatio, int numMaps)
|
||||
throws IOException {
|
||||
if (splitRatio * numMaps > MAX_CHUNKS_TOLERABLE)
|
||||
throw new IOException("Too many chunks created with splitRatio:"
|
||||
+ splitRatio + ", numMaps:" + numMaps
|
||||
+ ". Reduce numMaps or decrease split-ratio to proceed.");
|
||||
}
|
||||
|
||||
private static void closeAll(List<DynamicInputChunk> chunks) {
|
||||
for (DynamicInputChunk chunk: chunks)
|
||||
chunk.close();
|
||||
}
|
||||
|
||||
private static List<DynamicInputChunk> createChunks(Configuration config,
|
||||
int chunkCount, int nChunksTotal, int nChunksOpenAtOnce)
|
||||
throws IOException {
|
||||
List<DynamicInputChunk> chunks = new ArrayList<DynamicInputChunk>();
|
||||
int chunkIdUpperBound
|
||||
= Math.min(nChunksTotal, chunkCount + nChunksOpenAtOnce);
|
||||
|
||||
// If there will be fewer than nChunksOpenAtOnce chunks left after
|
||||
// the current batch of chunks, fold the remaining chunks into
|
||||
// the current batch.
|
||||
if (nChunksTotal - chunkIdUpperBound < nChunksOpenAtOnce)
|
||||
chunkIdUpperBound = nChunksTotal;
|
||||
|
||||
for (int i=chunkCount; i < chunkIdUpperBound; ++i)
|
||||
chunks.add(createChunk(i, config));
|
||||
return chunks;
|
||||
}
|
||||
|
||||
private static DynamicInputChunk createChunk(int chunkId, Configuration config)
|
||||
throws IOException {
|
||||
return DynamicInputChunk.createChunkForWrite(String.format("%05d", chunkId),
|
||||
config);
|
||||
}
|
||||
|
||||
|
||||
private static Path getListingFilePath(Configuration configuration) {
|
||||
String listingFilePathString = configuration.get(
|
||||
DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");
|
||||
|
||||
assert !listingFilePathString.equals("") : "Listing file not found.";
|
||||
|
||||
Path listingFilePath = new Path(listingFilePathString);
|
||||
try {
|
||||
assert listingFilePath.getFileSystem(configuration)
|
||||
.exists(listingFilePath) : "Listing file: " + listingFilePath +
|
||||
" not found.";
|
||||
} catch (IOException e) {
|
||||
assert false : "Listing file: " + listingFilePath
|
||||
+ " couldn't be accessed. " + e.getMessage();
|
||||
}
|
||||
return listingFilePath;
|
||||
}
|
||||
|
||||
private static int getNumberOfRecords(Configuration configuration) {
|
||||
return DistCpUtils.getInt(configuration,
|
||||
DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
|
||||
}
|
||||
|
||||
private static int getNumMapTasks(Configuration configuration) {
|
||||
return DistCpUtils.getInt(configuration,
|
||||
JobContext.NUM_MAPS);
|
||||
}
|
||||
|
||||
private static int getListingSplitRatio(Configuration configuration,
|
||||
int numMaps, int numPaths) {
|
||||
return configuration.getInt(
|
||||
CONF_LABEL_LISTING_SPLIT_RATIO,
|
||||
getSplitRatio(numMaps, numPaths));
|
||||
}
|
||||
|
||||
private static final int MAX_CHUNKS_TOLERABLE = 400;
|
||||
private static final int MAX_CHUNKS_IDEAL = 100;
|
||||
private static final int MIN_RECORDS_PER_CHUNK = 5;
|
||||
private static final int SPLIT_RATIO_DEFAULT = 2;
|
||||
|
||||
/**
|
||||
* Package private, for testability.
|
||||
* @param nMaps The number of maps requested for.
|
||||
* @param nRecords The number of records to be copied.
|
||||
* @return The number of splits each map should handle, ideally.
|
||||
*/
|
||||
static int getSplitRatio(int nMaps, int nRecords) {
|
||||
if (nMaps == 1) {
|
||||
LOG.warn("nMaps == 1. Why use DynamicInputFormat?");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (nMaps > MAX_CHUNKS_IDEAL)
|
||||
return SPLIT_RATIO_DEFAULT;
|
||||
|
||||
int nPickups = (int)Math.ceil((float)MAX_CHUNKS_IDEAL/nMaps);
|
||||
int nRecordsPerChunk = (int)Math.ceil((float)nRecords/(nMaps*nPickups));
|
||||
|
||||
return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ?
|
||||
SPLIT_RATIO_DEFAULT : nPickups;
|
||||
}
|
||||
|
||||
static int getNumEntriesPerChunk(Configuration configuration) {
|
||||
return DistCpUtils.getInt(configuration,
|
||||
CONF_LABEL_NUM_ENTRIES_PER_CHUNK);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implementation of Inputformat::createRecordReader().
|
||||
* @param inputSplit The split for which the RecordReader is required.
|
||||
* @param taskAttemptContext TaskAttemptContext for the current attempt.
|
||||
* @return DynamicRecordReader instance.
|
||||
* @throws IOException, on failure.
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
@Override
|
||||
public RecordReader<K, V> createRecordReader(
|
||||
InputSplit inputSplit,
|
||||
TaskAttemptContext taskAttemptContext)
|
||||
throws IOException, InterruptedException {
|
||||
return new DynamicRecordReader<K, V>();
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue