Merge r1242606 through r1244221 from 0.23.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23-PB@1244226 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 2012-02-14 21:49:02 +00:00
commit fd0b6a9596
37 changed files with 717 additions and 176 deletions

View File

@@ -68,6 +68,8 @@ Release 0.23.2 - UNRELEASED
  NEW FEATURES
  IMPROVEMENTS
    HADOOP-8048. Allow merging of Credentials (Daryn Sharp via tgraves)
    HADOOP-8032. mvn site:stage-deploy should be able to use the scp protocol
    to stage documents (Ravi Prakash via tgraves)
@@ -75,6 +77,8 @@ Release 0.23.2 - UNRELEASED
    (szetszwo)
  OPTIMIZATIONS
    HADOOP-8071. Avoid an extra packet in client code when nagling is
    disabled. (todd)
  BUG FIXES
@@ -85,6 +89,14 @@ Release 0.23.2 - UNRELEASED
    HADOOP-8035 Hadoop Maven site is inefficient and runs phases redundantly
    (abayer via tucu)
    HADOOP-8051 HttpFS documentation it is not wired to the generated site (tucu)
    HADOOP-8055. Hadoop tarball distribution lacks a core-site.xml (harsh)
    HADOOP-8052. Hadoop Metrics2 should emit Float.MAX_VALUE (instead of
    Double.MAX_VALUE) to avoid making Ganglia's gmetad core. (Varun Kapoor
    via mattf)
Release 0.23.1 - 2012-02-08
  INCOMPATIBLE CHANGES

View File

@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
</configuration>

View File

@@ -799,9 +799,12 @@ public class Client {
        header.write(d);
        call.rpcRequest.write(d);
        byte[] data = d.getData();
-       int dataLength = d.getLength();
-       out.writeInt(dataLength);      //first put the data length
-       out.write(data, 0, dataLength);//write the data
+       int dataLength = d.getLength() - 4;
+       data[0] = (byte)((dataLength >>> 24) & 0xff);
+       data[1] = (byte)((dataLength >>> 16) & 0xff);
+       data[2] = (byte)((dataLength >>> 8) & 0xff);
+       data[3] = (byte)(dataLength & 0xff);
+       out.write(data, 0, dataLength + 4);//write the data
        out.flush();
      }
    } catch(IOException e) {
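The point of the HADOOP-8071 change above is that the 4-byte length prefix is patched into the same buffer as the serialized request, so the client issues a single write() and a connection with Nagle disabled (TCP_NODELAY) does not send the prefix and the payload as two separate packets. A minimal standalone sketch of the same framing trick, with my own names rather than the Hadoop Client API:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Sketch only: reserve 4 bytes up front, then patch the big-endian length in
// before a single write, so prefix and payload leave in one socket write.
public class LengthPrefixedFrame {
  public static byte[] frame(byte[] payload) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bos);
    out.writeInt(0);            // placeholder for the length prefix
    out.write(payload);         // serialized header + request would go here
    byte[] data = bos.toByteArray();
    int dataLength = data.length - 4;
    data[0] = (byte) ((dataLength >>> 24) & 0xff);
    data[1] = (byte) ((dataLength >>> 16) & 0xff);
    data[2] = (byte) ((dataLength >>> 8) & 0xff);
    data[3] = (byte) (dataLength & 0xff);
    return data;                // hand the whole frame to one write()
  }
}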

View File

@@ -143,8 +143,16 @@ public class SampleStat {
  @SuppressWarnings("PublicInnerClass")
  public static class MinMax {
-   private double min = Double.MAX_VALUE;
-   private double max = Double.MIN_VALUE;
+   // Float.MAX_VALUE is used rather than Double.MAX_VALUE, even though the
+   // min and max variables are of type double.
+   // Float.MAX_VALUE is big enough, and using Double.MAX_VALUE makes
+   // Ganglia core due to buffer overflow.
+   // The same reasoning applies to the MIN_VALUE counterparts.
+   static final double DEFAULT_MIN_VALUE = Float.MAX_VALUE;
+   static final double DEFAULT_MAX_VALUE = Float.MIN_VALUE;
+   private double min = DEFAULT_MIN_VALUE;
+   private double max = DEFAULT_MAX_VALUE;
    public void add(double value) {
      if (value > max) max = value;
@@ -155,8 +163,8 @@ public class SampleStat {
    public double max() { return max; }
    public void reset() {
-     min = Double.MAX_VALUE;
-     max = Double.MIN_VALUE;
+     min = DEFAULT_MIN_VALUE;
+     max = DEFAULT_MAX_VALUE;
    }
    public void reset(MinMax other) {
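For context, HADOOP-8052 (listed in CHANGES.txt above) swaps the Double-range sentinels for Float-range ones because emitting the huge Double defaults was reported to crash Ganglia's gmetad. A toy illustration of the new defaults, independent of the Hadoop classes; the values and class name here are mine:

public class MinMaxDefaultsDemo {
  public static void main(String[] args) {
    double min = Float.MAX_VALUE;   // DEFAULT_MIN_VALUE in the patch
    double max = Float.MIN_VALUE;   // DEFAULT_MAX_VALUE in the patch
    for (double sample : new double[] {3.0, 0.5, 7.25}) {
      if (sample < min) min = sample;
      if (sample > max) max = sample;
    }
    System.out.println("min=" + min + " max=" + max);  // min=0.5 max=7.25
  }
}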

View File

@@ -230,14 +230,34 @@ public class Credentials implements Writable {
  /**
   * Copy all of the credentials from one credential object into another.
+  * Existing secrets and tokens are overwritten.
   * @param other the credentials to copy
   */
  public void addAll(Credentials other) {
+   addAll(other, true);
+ }
+
+ /**
+  * Copy all of the credentials from one credential object into another.
+  * Existing secrets and tokens are not overwritten.
+  * @param other the credentials to copy
+  */
+ public void mergeAll(Credentials other) {
+   addAll(other, false);
+ }
+
+ private void addAll(Credentials other, boolean overwrite) {
    for(Map.Entry<Text, byte[]> secret: other.secretKeysMap.entrySet()) {
-     secretKeysMap.put(secret.getKey(), secret.getValue());
+     Text key = secret.getKey();
+     if (!secretKeysMap.containsKey(key) || overwrite) {
+       secretKeysMap.put(key, secret.getValue());
+     }
    }
    for(Map.Entry<Text, Token<?>> token: other.tokenMap.entrySet()){
-     tokenMap.put(token.getKey(), token.getValue());
+     Text key = token.getKey();
+     if (!tokenMap.containsKey(key) || overwrite) {
+       tokenMap.put(key, token.getValue());
+     }
    }
  }
}
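A short usage sketch of the two entry points added here (HADOOP-8048), using only methods visible in this patch; addAll() overwrites colliding keys while mergeAll() preserves the existing entries:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;

public class CredentialsMergeDemo {
  public static void main(String[] args) {
    Credentials base = new Credentials();
    base.addSecretKey(new Text("alias"), "old".getBytes());

    Credentials incoming = new Credentials();
    incoming.addSecretKey(new Text("alias"), "new".getBytes());

    base.mergeAll(incoming);  // existing "alias" is kept ("old")
    base.addAll(incoming);    // existing "alias" is overwritten ("new")
    System.out.println(new String(base.getSecretKey(new Text("alias"))));
  }
}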

View File

@@ -36,8 +36,8 @@ public class TestSampleStat {
    assertEquals("mean", 0.0, stat.mean(), EPSILON);
    assertEquals("variance", 0.0, stat.variance(), EPSILON);
    assertEquals("stddev", 0.0, stat.stddev(), EPSILON);
-   assertEquals("min", Double.MAX_VALUE, stat.min(), EPSILON);
-   assertEquals("max", Double.MIN_VALUE, stat.max(), EPSILON);
+   assertEquals("min", SampleStat.MinMax.DEFAULT_MIN_VALUE, stat.min(), EPSILON);
+   assertEquals("max", SampleStat.MinMax.DEFAULT_MAX_VALUE, stat.max(), EPSILON);
    stat.add(3);
    assertEquals("num samples", 1L, stat.numSamples());
@@ -60,8 +60,8 @@ public class TestSampleStat {
    assertEquals("mean", 0.0, stat.mean(), EPSILON);
    assertEquals("variance", 0.0, stat.variance(), EPSILON);
    assertEquals("stddev", 0.0, stat.stddev(), EPSILON);
-   assertEquals("min", Double.MAX_VALUE, stat.min(), EPSILON);
-   assertEquals("max", Double.MIN_VALUE, stat.max(), EPSILON);
+   assertEquals("min", SampleStat.MinMax.DEFAULT_MIN_VALUE, stat.min(), EPSILON);
+   assertEquals("max", SampleStat.MinMax.DEFAULT_MAX_VALUE, stat.max(), EPSILON);
  }
}

View File

@@ -137,4 +137,81 @@ public class TestCredentials {
    }
    tmpFileName.delete();
  }
static Text secret[] = {
new Text("secret1"),
new Text("secret2"),
new Text("secret3"),
new Text("secret4")
};
static Text service[] = {
new Text("service1"),
new Text("service2"),
new Text("service3"),
new Text("service4")
};
static Token<?> token[] = {
new Token<TokenIdentifier>(),
new Token<TokenIdentifier>(),
new Token<TokenIdentifier>(),
new Token<TokenIdentifier>()
};
@Test
public void addAll() {
Credentials creds = new Credentials();
creds.addToken(service[0], token[0]);
creds.addToken(service[1], token[1]);
creds.addSecretKey(secret[0], secret[0].getBytes());
creds.addSecretKey(secret[1], secret[1].getBytes());
Credentials credsToAdd = new Credentials();
// one duplicate with different value, one new
credsToAdd.addToken(service[0], token[3]);
credsToAdd.addToken(service[2], token[2]);
credsToAdd.addSecretKey(secret[0], secret[3].getBytes());
credsToAdd.addSecretKey(secret[2], secret[2].getBytes());
creds.addAll(credsToAdd);
assertEquals(3, creds.numberOfTokens());
assertEquals(3, creds.numberOfSecretKeys());
// existing token & secret should be overwritten
assertEquals(token[3], creds.getToken(service[0]));
assertEquals(secret[3], new Text(creds.getSecretKey(secret[0])));
// non-duplicate token & secret should be present
assertEquals(token[1], creds.getToken(service[1]));
assertEquals(secret[1], new Text(creds.getSecretKey(secret[1])));
// new token & secret should be added
assertEquals(token[2], creds.getToken(service[2]));
assertEquals(secret[2], new Text(creds.getSecretKey(secret[2])));
}
@Test
public void mergeAll() {
Credentials creds = new Credentials();
creds.addToken(service[0], token[0]);
creds.addToken(service[1], token[1]);
creds.addSecretKey(secret[0], secret[0].getBytes());
creds.addSecretKey(secret[1], secret[1].getBytes());
Credentials credsToAdd = new Credentials();
// one duplicate with different value, one new
credsToAdd.addToken(service[0], token[3]);
credsToAdd.addToken(service[2], token[2]);
credsToAdd.addSecretKey(secret[0], secret[3].getBytes());
credsToAdd.addSecretKey(secret[2], secret[2].getBytes());
creds.mergeAll(credsToAdd);
assertEquals(3, creds.numberOfTokens());
assertEquals(3, creds.numberOfSecretKeys());
// existing token & secret should not be overwritten
assertEquals(token[0], creds.getToken(service[0]));
assertEquals(secret[0], new Text(creds.getSecretKey(secret[0])));
// non-duplicate token & secret should be present
assertEquals(token[1], creds.getToken(service[1]));
assertEquals(secret[1], new Text(creds.getSecretKey(secret[1])));
// new token & secret should be added
assertEquals(token[2], creds.getToken(service[2]));
assertEquals(secret[2], new Text(creds.getSecretKey(secret[2])));
  }
}

View File

@@ -14,21 +14,16 @@
-->
<project name="HttpFS">
- <version position="right"/>
- <bannerLeft>
-   <name>&nbsp;</name>
- </bannerLeft>
- <skin>
-   <groupId>org.apache.maven.skins</groupId>
-   <artifactId>maven-stylus-skin</artifactId>
-   <version>1.2</version>
- </skin>
- <body>
-   <links>
-   </links>
- </body>
+ <skin>
+   <groupId>org.apache.maven.skins</groupId>
+   <artifactId>maven-stylus-skin</artifactId>
+   <version>1.2</version>
+ </skin>
+ <body>
+   <links>
+     <item name="Apache Hadoop" href="http://hadoop.apache.org/"/>
+   </links>
+ </body>
</project>

View File

@@ -130,8 +130,14 @@ Release 0.23.2 - UNRELEASED
  NEW FEATURES
    HDFS-2943. Expose last checkpoint time and transaction stats as JMX
    metrics. (atm)
  IMPROVEMENTS
    HDFS-2931. Switch DataNode's BlockVolumeChoosingPolicy to private-audience.
    (harsh via szetszwo)
  OPTIMIZATIONS
  BUG FIXES
@@ -140,6 +146,14 @@ Release 0.23.2 - UNRELEASED
    HDFS-2764. TestBackupNode is racy. (atm)
    HDFS-2869. Fix an error in the webhdfs docs for the mkdir op (harsh)
    HDFS-776. Fix exception handling in Balancer. (Uma Maheswara Rao G
    via szetszwo)
    HDFS-2815. Namenode sometimes does not come out of safemode during
    NN crash + restart. (Uma Maheswara Rao via suresh)
Release 0.23.1 - 2012-02-08
  INCOMPATIBLE CHANGES

View File

@@ -349,7 +349,7 @@ Hello, webhdfs user!
<ul>
  <li>Submit a HTTP PUT request.
<source>
-curl -i -X PUT "http://&lt;HOST&gt;:&lt;PORT&gt;/&lt;PATH&gt;?op=MKDIRS[&amp;permission=&lt;OCTAL&gt;]"
+curl -i -X PUT "http://&lt;HOST&gt;:&lt;PORT&gt;/webhdfs/v1/&lt;PATH&gt;?op=MKDIRS[&amp;permission=&lt;OCTAL&gt;]"
</source>
The client receives a response with a <a href="#boolean"><code>boolean</code> JSON object</a>:
<source>

View File

@@ -125,6 +125,10 @@ class NameNodeConnector {
    if (!isBlockTokenEnabled) {
      return BlockTokenSecretManager.DUMMY_TOKEN;
    } else {
+     if (!shouldRun) {
+       throw new IOException(
+           "Can not get access token. BlockKeyUpdater is not running");
+     }
      return blockTokenSecretManager.generateToken(null, eb,
          EnumSet.of(BlockTokenSecretManager.AccessMode.REPLACE,
              BlockTokenSecretManager.AccessMode.COPY));
@@ -221,16 +225,20 @@
   */
  class BlockKeyUpdater implements Runnable {
    public void run() {
-     while (shouldRun) {
-       try {
-         blockTokenSecretManager.setKeys(namenode.getBlockKeys());
-       } catch (Exception e) {
-         LOG.error("Failed to set keys", e);
-       }
-       try {
-         Thread.sleep(keyUpdaterInterval);
-       } catch (InterruptedException ie) {
-       }
+     try {
+       while (shouldRun) {
+         try {
+           blockTokenSecretManager.setKeys(namenode.getBlockKeys());
+         } catch (IOException e) {
+           LOG.error("Failed to set keys", e);
+         }
+         Thread.sleep(keyUpdaterInterval);
+       }
+     } catch (InterruptedException e) {
+       LOG.info("InterruptedException in block key updater thread", e);
+     } catch (Throwable e) {
+       LOG.error("Exception in block key updater thread", e);
+       shouldRun = false;
      }
    }
  }
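The restructured run() above keeps one outer try: an InterruptedException from sleep() now ends the loop quietly, and any unexpected Throwable marks the updater as stopped, which the new shouldRun check in getAccessToken() relies on. A generic sketch of that loop shape, with hypothetical names rather than the HDFS classes:

// Sketch: transient errors are logged and retried, interruption ends the
// thread quietly, and anything unexpected flips the running flag so callers
// can refuse to hand out tokens from a dead updater.
class PeriodicUpdater implements Runnable {
  private volatile boolean shouldRun = true;
  private final long intervalMs = 10000L;

  public void run() {
    try {
      while (shouldRun) {
        try {
          refreshKeys();                    // may fail transiently
        } catch (Exception e) {
          System.err.println("Failed to refresh keys: " + e);
        }
        Thread.sleep(intervalMs);           // InterruptedException exits the loop
      }
    } catch (InterruptedException e) {
      // shutdown requested; fall through
    } catch (Throwable t) {
      shouldRun = false;                    // mark the updater as dead
    }
  }

  boolean isRunning() { return shouldRun; }

  private void refreshKeys() throws Exception { /* hypothetical key fetch */ }
}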

View File

@@ -28,10 +28,12 @@ import org.apache.hadoop.hdfs.server.datanode.FSDatasetInterface.FSVolumeInterfa
 * BlockVolumeChoosingPolicy allows a DataNode to
 * specify what policy is to be used while choosing
 * a volume for a block request.
 *
+ * Note: This is an evolving i/f and is only for
+ * advanced use.
+ *
 ***************************************************/
-@InterfaceAudience.Public
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
public interface BlockVolumeChoosingPolicy {

  /**

View File

@@ -710,7 +710,7 @@ public class FSImage implements Closeable {
    long txId = loader.getLoadedImageTxId();
    LOG.info("Loaded image for txid " + txId + " from " + curFile);
    lastAppliedTxId = txId;
-   storage.setMostRecentCheckpointTxId(txId);
+   storage.setMostRecentCheckpointInfo(txId, curFile.lastModified());
  }
  /**
@@ -726,7 +726,7 @@ public class FSImage implements Closeable {
    saver.save(newFile, txid, source, compression);
    MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
-   storage.setMostRecentCheckpointTxId(txid);
+   storage.setMostRecentCheckpointInfo(txid, Util.now());
  }
  /**
@@ -988,7 +988,7 @@ public class FSImage implements Closeable {
      // advertise it as such to other checkpointers
      // from now on
      if (txid > storage.getMostRecentCheckpointTxId()) {
-       storage.setMostRecentCheckpointTxId(txid);
+       storage.setMostRecentCheckpointInfo(txid, Util.now());
      }
    }

View File

@@ -1933,7 +1933,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
                      boolean enforcePermission)
      throws AccessControlException, SafeModeException, UnresolvedLinkException,
             IOException {
-   boolean deleteNow = false;
    ArrayList<Block> collectedBlocks = new ArrayList<Block>();
    writeLock();
@@ -1951,10 +1950,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
      if (!dir.delete(src, collectedBlocks)) {
        return false;
      }
-     deleteNow = collectedBlocks.size() <= BLOCK_DELETION_INCREMENT;
-     if (deleteNow) { // Perform small deletes right away
-       removeBlocks(collectedBlocks);
-     }
    } finally {
      writeUnlock();
    }
@@ -1963,9 +1958,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    writeLock();
    try {
-     if (!deleteNow) {
-       removeBlocks(collectedBlocks); // Incremental deletion of blocks
-     }
+     removeBlocks(collectedBlocks); // Incremental deletion of blocks
    } finally {
      writeUnlock();
    }
@@ -2659,6 +2652,31 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
  public int getExpiredHeartbeats() {
    return datanodeStatistics.getExpiredHeartbeats();
  }
@Metric({"TransactionsSinceLastCheckpoint",
"Number of transactions since last checkpoint"})
public long getTransactionsSinceLastCheckpoint() {
return getEditLog().getLastWrittenTxId() -
getFSImage().getStorage().getMostRecentCheckpointTxId();
}
@Metric({"TransactionsSinceLastLogRoll",
"Number of transactions since last edit log roll"})
public long getTransactionsSinceLastLogRoll() {
return (getEditLog().getLastWrittenTxId() -
getEditLog().getCurSegmentTxId()) + 1;
}
@Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
public long getLastWrittenTransactionId() {
return getEditLog().getLastWrittenTxId();
}
@Metric({"LastCheckpointTime",
"Time in milliseconds since the epoch of the last checkpoint"})
public long getLastCheckpointTime() {
return getFSImage().getStorage().getMostRecentCheckpointTime();
}
  /** @see ClientProtocol#getStats() */
  long[] getStats() {
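The new gauges above (HDFS-2943) are plain differences over transaction ids the NameNode already tracks through getLastWrittenTxId(), getCurSegmentTxId() and the checkpoint info. A small worked example with made-up numbers, mirroring the formulas rather than NameNode internals:

public class NNTxnMetricsMath {
  public static void main(String[] args) {
    // Hypothetical values for illustration only.
    long lastWrittenTxId = 4;     // latest transaction written to the edit log
    long curSegmentTxId = 4;      // first txid of the in-progress edit segment
    long lastCheckpointTxId = 0;  // txid recorded by the most recent checkpoint

    long sinceCheckpoint = lastWrittenTxId - lastCheckpointTxId;   // 4
    long sinceLogRoll = (lastWrittenTxId - curSegmentTxId) + 1;    // 1
    System.out.println(sinceCheckpoint + " " + sinceLogRoll);
  }
}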

View File

@@ -125,6 +125,11 @@ public class NNStorage extends Storage implements Closeable {
   * that have since been written to the edit log.
   */
  protected long mostRecentCheckpointTxId = HdfsConstants.INVALID_TXID;
+ /**
+  * Time of the last checkpoint, in milliseconds since the epoch.
+  */
+ private long mostRecentCheckpointTime = 0;
  /**
   * list of failed (and thus removed) storages
@@ -417,18 +422,29 @@ public class NNStorage extends Storage implements Closeable {
  }
  /**
-  * Set the transaction ID of the last checkpoint
+  * Set the transaction ID and time of the last checkpoint
+  *
+  * @param txid transaction id of the last checkpoint
+  * @param time time of the last checkpoint, in millis since the epoch
   */
- void setMostRecentCheckpointTxId(long txid) {
+ void setMostRecentCheckpointInfo(long txid, long time) {
    this.mostRecentCheckpointTxId = txid;
+   this.mostRecentCheckpointTime = time;
  }
  /**
-  * Return the transaction ID of the last checkpoint.
+  * @return the transaction ID of the last checkpoint.
   */
  long getMostRecentCheckpointTxId() {
    return mostRecentCheckpointTxId;
  }
+ /**
+  * @return the time of the most recent checkpoint in millis since the epoch.
+  */
+ long getMostRecentCheckpointTime() {
+   return mostRecentCheckpointTime;
+ }
  /**
   * Write a small file in all available storage directories that

View File

@@ -57,14 +57,12 @@ public class TestBalancerWithMultipleNameNodes {
    ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.OFF);
    ((Log4JLogger)LeaseManager.LOG).getLogger().setLevel(Level.OFF);
    ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.OFF);
-   // ((Log4JLogger)DataNode.LOG).getLogger().setLevel(Level.OFF);
  }
  private static final long CAPACITY = 500L;
  private static final String RACK0 = "/rack0";
  private static final String RACK1 = "/rack1";
- private static final String RACK2 = "/rack2";
  private static final String FILE_NAME = "/tmp.txt";
  private static final Path FILE_PATH = new Path(FILE_NAME);

View File

@@ -20,13 +20,12 @@ package org.apache.hadoop.hdfs.server.namenode.metrics;
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
+import static org.junit.Assert.*;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.Random;
-import junit.framework.TestCase;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
@@ -39,17 +38,21 @@ import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.test.MetricsAsserts;
import org.apache.log4j.Level;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
/**
 * Test for metrics published by the Namenode
 */
-public class TestNameNodeMetrics extends TestCase {
+public class TestNameNodeMetrics {
  private static final Configuration CONF = new HdfsConfiguration();
  private static final int DFS_REPLICATION_INTERVAL = 1;
  private static final Path TEST_ROOT_DIR_PATH =
@@ -81,8 +84,8 @@ public class TestNameNodeMetrics extends TestCase {
    return new Path(TEST_ROOT_DIR_PATH, fileName);
  }
- @Override
- protected void setUp() throws Exception {
+ @Before
+ public void setUp() throws Exception {
    cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT).build();
    cluster.waitActive();
    namesystem = cluster.getNamesystem();
@@ -90,8 +93,8 @@ public class TestNameNodeMetrics extends TestCase {
    fs = (DistributedFileSystem) cluster.getFileSystem();
  }
- @Override
- protected void tearDown() throws Exception {
+ @After
+ public void tearDown() throws Exception {
    cluster.shutdown();
  }
@@ -115,6 +118,7 @@ public class TestNameNodeMetrics extends TestCase {
  }
  /** Test metrics associated with addition of a file */
+ @Test
  public void testFileAdd() throws Exception {
    // Add files with 100 blocks
    final Path file = getTestPath("testFileAdd");
@@ -161,6 +165,7 @@ public class TestNameNodeMetrics extends TestCase {
  }
  /** Corrupt a block and ensure metrics reflects it */
+ @Test
  public void testCorruptBlock() throws Exception {
    // Create a file with single block with two replicas
    final Path file = getTestPath("testCorruptBlock");
@@ -186,6 +191,7 @@ public class TestNameNodeMetrics extends TestCase {
  /** Create excess blocks by reducing the replication factor for
   * for a file and ensure metrics reflects it
   */
+ @Test
  public void testExcessBlocks() throws Exception {
    Path file = getTestPath("testExcessBlocks");
    createFile(file, 100, (short)2);
@@ -198,6 +204,7 @@ public class TestNameNodeMetrics extends TestCase {
  }
  /** Test to ensure metrics reflects missing blocks */
+ @Test
  public void testMissingBlock() throws Exception {
    // Create a file with single block with two replicas
    Path file = getTestPath("testMissingBlocks");
@@ -216,6 +223,7 @@ public class TestNameNodeMetrics extends TestCase {
    assertGauge("UnderReplicatedBlocks", 0L, getMetrics(NS_METRICS));
  }
+ @Test
  public void testRenameMetrics() throws Exception {
    Path src = getTestPath("src");
    createFile(src, 100, (short)1);
@@ -240,7 +248,8 @@ public class TestNameNodeMetrics extends TestCase {
   *
   * @throws IOException in case of an error
   */
- public void testGetBlockLocationMetric() throws Exception{
+ @Test
+ public void testGetBlockLocationMetric() throws Exception {
    Path file1_Path = new Path(TEST_ROOT_DIR_PATH, "file1.dat");
    // When cluster starts first time there are no file (read,create,open)
@@ -268,4 +277,46 @@ public class TestNameNodeMetrics extends TestCase {
    updateMetrics();
    assertCounter("GetBlockLocations", 3L, getMetrics(NN_METRICS));
  }
/**
* Test NN checkpoint and transaction-related metrics.
*/
@Test
public void testTransactionAndCheckpointMetrics() throws Exception {
long lastCkptTime = MetricsAsserts.getLongGauge("LastCheckpointTime",
getMetrics(NS_METRICS));
assertGauge("LastCheckpointTime", lastCkptTime, getMetrics(NS_METRICS));
assertGauge("LastWrittenTransactionId", 1L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastCheckpoint", 1L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastLogRoll", 1L, getMetrics(NS_METRICS));
fs.mkdirs(new Path(TEST_ROOT_DIR_PATH, "/tmp"));
updateMetrics();
assertGauge("LastCheckpointTime", lastCkptTime, getMetrics(NS_METRICS));
assertGauge("LastWrittenTransactionId", 2L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastCheckpoint", 2L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastLogRoll", 2L, getMetrics(NS_METRICS));
cluster.getNameNodeRpc().rollEditLog();
updateMetrics();
assertGauge("LastCheckpointTime", lastCkptTime, getMetrics(NS_METRICS));
assertGauge("LastWrittenTransactionId", 4L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastCheckpoint", 4L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastLogRoll", 1L, getMetrics(NS_METRICS));
cluster.getNameNodeRpc().setSafeMode(SafeModeAction.SAFEMODE_ENTER);
cluster.getNameNodeRpc().saveNamespace();
cluster.getNameNodeRpc().setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
updateMetrics();
long newLastCkptTime = MetricsAsserts.getLongGauge("LastCheckpointTime",
getMetrics(NS_METRICS));
assertTrue(lastCkptTime < newLastCkptTime);
assertGauge("LastWrittenTransactionId", 6L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastCheckpoint", 1L, getMetrics(NS_METRICS));
assertGauge("TransactionsSinceLastLogRoll", 1L, getMetrics(NS_METRICS));
}
}

View File

@@ -28,6 +28,8 @@ Release 0.23-PB - Unreleased
Release 0.23.2 - UNRELEASED
  INCOMPATIBLE CHANGES
  NEW FEATURES
  IMPROVEMENTS
@@ -36,9 +38,17 @@ Release 0.23.2 - UNRELEASED
  BUG FIXES
    MAPREDUCE-3680. FifoScheduler web service rest API can print out invalid
    JSON. (B Anil Kumar via tgraves)
    MAPREDUCE-3852. Test TestLinuxResourceCalculatorPlugin failing. (Thomas
    Graves via mahadev)
Release 0.23.1 - 2012-02-08
  NEW FEATURES
    MAPREDUCE-778. Rumen Anonymizer. (Amar Kamat and Chris Douglas via amarrk)
    MAPREDUCE-3121. NodeManager should handle disk-failures (Ravi Gummadi via mahadev)
@@ -51,6 +61,7 @@ Release 0.23.1 - 2012-02-08
    MAPREDUCE-778. Rumen Anonymizer. (Amar Kamat and Chris Douglas via amarrk)
  IMPROVEMENTS
    MAPREDUCE-3481. [Gridmix] Improve Gridmix STRESS mode. (amarrk)
    MAPREDUCE-3597. [Rumen] Rumen should provide APIs to access all the
@@ -58,9 +69,10 @@ Release 0.23.1 - 2012-02-08
    MAPREDUCE-3375. [Gridmix] Memory Emulation system tests.
    (Vinay Thota via amarrk)
    MAPREDUCE-3840. JobEndNotifier doesn't use the proxyToUse during connecting
    (Ravi Prakash via bobby)
    MAPREDUCE-2733. [Gridmix] Gridmix3 cpu emulation system tests.
    (Vinay Thota via amarrk)
@@ -236,10 +248,6 @@ Release 0.23.1 - 2012-02-08
    acmurthy)
  BUG FIXES
    MAPREDUCE-3770. Zombie.getJobConf() results into NPE. (amarrk)
    MAPREDUCE-3804. yarn webapp interface vulnerable to cross scripting attacks
    (Dave Thompson via bobby)
    MAPREDUCE-2784. [Gridmix] Bug fixes in ExecutionSummarizer and
    ResourceUsageMatcher. (amarrk)
@@ -719,6 +727,9 @@ Release 0.23.1 - 2012-02-08
    MAPREDUCE-3808. Fixed an NPE in FileOutputCommitter for jobs with maps
    but no reduces. (Robert Joseph Evans via vinodkv)
    MAPREDUCE-3804. yarn webapp interface vulnerable to cross scripting attacks
    (Dave Thompson via bobby)
    MAPREDUCE-3354. Changed scripts so that jobhistory server is started by
    bin/mapred instead of bin/yarn. (Jonathan Eagles via acmurthy)
@@ -731,7 +742,6 @@ Release 0.23.1 - 2012-02-08
    MAPREDUCE-3697. Support binary compatibility for Counters after
    MAPREDUCE-901. (mahadev via acmurthy)
    MAPREDUCE-3817. Fixed bin/mapred to allow running of distcp and archive
    jobs. (Arpit Gupta via acmurthy)
@@ -768,6 +778,17 @@ Release 0.23.1 - 2012-02-08
    MAPREDUCE-3828. Ensure that urls in single-node mode are correct. (sseth
    via acmurthy)
    MAPREDUCE-3770. Zombie.getJobConf() results into NPE. (amarrk)
    MAPREDUCE-3843. Job summary log file found missing on the RM host
    (Anupam Seth via tgraves)
    MAPREDUCE-3846. Addressed MR AM hanging issues during AM restart and then
    the recovery. (vinodkv)
    MAPREDUCE-3802. Added test to validate that AM can crash multiple times and
    still can recover successfully after MAPREDUCE-3846. (vinodkv)
Release 0.23.0 - 2011-11-01
  INCOMPATIBLE CHANGES
@@ -2658,6 +2679,9 @@ Release 0.22.1 - Unreleased
  BUG FIXES
    MAPREDUCE-3837. Job tracker is not able to recover jobs after crash.
    (Mayank Bansal via shv)
Release 0.22.0 - 2011-11-29
  INCOMPATIBLE CHANGES

View File

@@ -20,6 +20,9 @@
#
# Environment Variables
#
+#   HADOOP_LOGFILE Hadoop log file.
+#   HADOOP_ROOT_LOGGER Hadoop root logger.
+#   HADOOP_JHS_LOGGER Hadoop JobSummary logger.
#   YARN_CONF_DIR Alternate conf dir. Default is ${YARN_HOME}/conf.
#   YARN_LOG_DIR Where log files are stored. PWD by default.
#   YARN_MASTER host:path where hadoop code should be rsync'd from
@@ -86,8 +89,9 @@ if [ "$YARN_PID_DIR" = "" ]; then
fi
# some variables
-export YARN_LOGFILE=yarn-$YARN_IDENT_STRING-$command-$HOSTNAME.log
-export YARN_ROOT_LOGGER=${YARN_ROOT_LOGGER:-INFO,DRFA}
+export HADOOP_LOGFILE=yarn-$YARN_IDENT_STRING-$command-$HOSTNAME.log
+export HADOOP_ROOT_LOGGER=${HADOOP_ROOT_LOGGER:-INFO,DRFA}
+export HADOOP_JHS_LOGGER=${HADOOP_JHS_LOGGER:-INFO,JSA}
log=$YARN_LOG_DIR/yarn-$YARN_IDENT_STRING-$command-$HOSTNAME.out
pid=$YARN_PID_DIR/yarn-$YARN_IDENT_STRING-$command.pid

View File

@@ -244,7 +244,7 @@ public class JobHistoryEventHandler extends AbstractService
        while (!stopped && !Thread.currentThread().isInterrupted()) {
          // Log the size of the history-event-queue every so often.
-         if (eventCounter % 1000 == 0) {
+         if (eventCounter != 0 && eventCounter % 1000 == 0) {
            eventCounter = 0;
            LOG.info("Size of the JobHistory event queue is "
                + eventQueue.size());
@@ -464,8 +464,10 @@ public class JobHistoryEventHandler extends AbstractService
      }
      processEventForJobSummary(event.getHistoryEvent(), mi.getJobSummary(),
          event.getJobID());
-     LOG.info("In HistoryEventHandler "
-         + event.getHistoryEvent().getEventType());
+     if (LOG.isDebugEnabled()) {
+       LOG.debug("In HistoryEventHandler "
+           + event.getHistoryEvent().getEventType());
+     }
    } catch (IOException e) {
      LOG.error("Error writing History Event: " + event.getHistoryEvent(),
          e);
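Both tweaks above reduce log noise on the hot path: the queue-size message now skips the very first event, and the per-event message is built only when debug logging is enabled. A minimal sketch of the guarded-debug pattern with commons-logging, in a hypothetical class of my own:

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// Build the message only when debug is enabled, so the common case pays no
// string-concatenation cost on every event.
public class GuardedDebugLogging {
  private static final Log LOG = LogFactory.getLog(GuardedDebugLogging.class);

  void handle(Object event) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("In HistoryEventHandler " + event);
    }
  }
}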

View File

@@ -26,7 +26,6 @@ import java.security.PrivilegedExceptionAction;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.logging.Log;
@@ -48,6 +47,7 @@ import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.jobhistory.AMStartedEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEventHandler;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
@@ -123,7 +123,7 @@ import org.apache.hadoop.yarn.util.ConverterUtils;
 * The information is shared across different components using AppContext.
 */
-@SuppressWarnings("deprecation")
+@SuppressWarnings("rawtypes")
public class MRAppMaster extends CompositeService {
  private static final Log LOG = LogFactory.getLog(MRAppMaster.class);
@@ -138,7 +138,7 @@ public class MRAppMaster extends CompositeService {
  private final int nmPort;
  private final int nmHttpPort;
  protected final MRAppMetrics metrics;
- private Set<TaskId> completedTasksFromPreviousRun;
+ private Map<TaskId, TaskInfo> completedTasksFromPreviousRun;
  private List<AMInfo> amInfos;
  private AppContext context;
  private Dispatcher dispatcher;
@@ -596,7 +596,7 @@ public class MRAppMaster extends CompositeService {
    return dispatcher;
  }
- public Set<TaskId> getCompletedTaskFromPreviousRun() {
+ public Map<TaskId, TaskInfo> getCompletedTaskFromPreviousRun() {
    return completedTasksFromPreviousRun;
  }
@@ -737,7 +737,6 @@ public class MRAppMaster extends CompositeService {
    return jobs;
  }
- @SuppressWarnings("rawtypes")
  @Override
  public EventHandler getEventHandler() {
    return dispatcher.getEventHandler();

View File

@@ -50,6 +50,7 @@ import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.jobhistory.JobFinishedEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobInfoChangeEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobInitedEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobSubmittedEvent;
@@ -133,7 +134,7 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
  private float cleanupWeight = 0.05f;
  private float mapWeight = 0.0f;
  private float reduceWeight = 0.0f;
- private final Set<TaskId> completedTasksFromPreviousRun;
+ private final Map<TaskId, TaskInfo> completedTasksFromPreviousRun;
  private final List<AMInfo> amInfos;
  private final Lock readLock;
  private final Lock writeLock;
@@ -376,7 +377,7 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
      TaskAttemptListener taskAttemptListener,
      JobTokenSecretManager jobTokenSecretManager,
      Credentials fsTokenCredentials, Clock clock,
-     Set<TaskId> completedTasksFromPreviousRun, MRAppMetrics metrics,
+     Map<TaskId, TaskInfo> completedTasksFromPreviousRun, MRAppMetrics metrics,
      OutputCommitter committer, boolean newApiCommitter, String userName,
      long appSubmitTime, List<AMInfo> amInfos) {
    this.applicationAttemptId = applicationAttemptId;

View File

@@ -19,13 +19,14 @@
package org.apache.hadoop.mapreduce.v2.app.job.impl;
import java.util.Collection;
-import java.util.Set;
+import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapTaskAttemptImpl;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -38,7 +39,7 @@ import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.yarn.Clock;
import org.apache.hadoop.yarn.event.EventHandler;
-@SuppressWarnings({ "rawtypes", "deprecation" })
+@SuppressWarnings({ "rawtypes" })
public class MapTaskImpl extends TaskImpl {
  private final TaskSplitMetaInfo taskSplitMetaInfo;
@@ -49,7 +50,7 @@ public class MapTaskImpl extends TaskImpl {
      TaskAttemptListener taskAttemptListener, OutputCommitter committer,
      Token<JobTokenIdentifier> jobToken,
      Collection<Token<? extends TokenIdentifier>> fsTokens, Clock clock,
-     Set<TaskId> completedTasksFromPreviousRun, int startCount,
+     Map<TaskId, TaskInfo> completedTasksFromPreviousRun, int startCount,
      MRAppMetrics metrics) {
    super(jobId, TaskType.MAP, partition, eventHandler, remoteJobConfFile,
        conf, taskAttemptListener, committer, jobToken, fsTokens, clock,

View File

@@ -19,13 +19,14 @@
package org.apache.hadoop.mapreduce.v2.app.job.impl;
import java.util.Collection;
-import java.util.Set;
+import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.ReduceTaskAttemptImpl;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
@@ -37,7 +38,7 @@ import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.yarn.Clock;
import org.apache.hadoop.yarn.event.EventHandler;
-@SuppressWarnings({ "rawtypes", "deprecation" })
+@SuppressWarnings({ "rawtypes" })
public class ReduceTaskImpl extends TaskImpl {
  private final int numMapTasks;
@@ -47,7 +48,7 @@ public class ReduceTaskImpl extends TaskImpl {
      int numMapTasks, TaskAttemptListener taskAttemptListener,
      OutputCommitter committer, Token<JobTokenIdentifier> jobToken,
      Collection<Token<? extends TokenIdentifier>> fsTokens, Clock clock,
-     Set<TaskId> completedTasksFromPreviousRun, int startCount,
+     Map<TaskId, TaskInfo> completedTasksFromPreviousRun, int startCount,
      MRAppMetrics metrics) {
    super(jobId, TaskType.REDUCE, partition, eventHandler, jobFile, conf,
        taskAttemptListener, committer, jobToken, fsTokens, clock,

View File

@@ -18,13 +18,14 @@
package org.apache.hadoop.mapreduce.v2.app.job.impl;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
+import java.util.Comparator;
import java.util.EnumSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -35,8 +36,11 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.jobhistory.TaskFailedEvent;
import org.apache.hadoop.mapreduce.jobhistory.TaskFinishedEvent;
import org.apache.hadoop.mapreduce.jobhistory.TaskStartedEvent;
@@ -66,6 +70,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskTAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.metrics.MRAppMetrics;
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerFailedEvent;
+import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.yarn.Clock;
@@ -208,8 +213,23 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
  private final StateMachine<TaskState, TaskEventType, TaskEvent>
    stateMachine;
- protected int nextAttemptNumber;
+ // By default, the next TaskAttempt number is zero. Changes during recovery
+ protected int nextAttemptNumber = 0;
+ private List<TaskAttemptInfo> taskAttemptsFromPreviousGeneration =
+     new ArrayList<TaskAttemptInfo>();
+ private static final class RecoverdAttemptsComparator implements
+     Comparator<TaskAttemptInfo> {
+   @Override
+   public int compare(TaskAttemptInfo attempt1, TaskAttemptInfo attempt2) {
+     long diff = attempt1.getStartTime() - attempt2.getStartTime();
+     return diff == 0 ? 0 : (diff < 0 ? -1 : 1);
+   }
+ }
+ private static final RecoverdAttemptsComparator RECOVERED_ATTEMPTS_COMPARATOR =
+     new RecoverdAttemptsComparator();
  //should be set to one which comes first
  //saying COMMIT_PENDING
@@ -230,7 +250,7 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
      TaskAttemptListener taskAttemptListener, OutputCommitter committer,
      Token<JobTokenIdentifier> jobToken,
      Collection<Token<? extends TokenIdentifier>> fsTokens, Clock clock,
-     Set<TaskId> completedTasksFromPreviousRun, int startCount,
+     Map<TaskId, TaskInfo> completedTasksFromPreviousRun, int startCount,
      MRAppMetrics metrics) {
    this.conf = conf;
    this.clock = clock;
@@ -243,10 +263,7 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
    // have a convention that none of the overrides depends on any
    // fields that need initialization.
    maxAttempts = getMaxAttempts();
-   taskId = recordFactory.newRecordInstance(TaskId.class);
-   taskId.setJobId(jobId);
-   taskId.setId(partition);
-   taskId.setTaskType(taskType);
+   taskId = MRBuilderUtils.newTaskId(jobId, partition, taskType);
    this.partition = partition;
    this.taskAttemptListener = taskAttemptListener;
    this.eventHandler = eventHandler;
@@ -255,17 +272,37 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
    this.jobToken = jobToken;
    this.metrics = metrics;
+   // See if this is from a previous generation.
    if (completedTasksFromPreviousRun != null
-       && completedTasksFromPreviousRun.contains(taskId)) {
+       && completedTasksFromPreviousRun.containsKey(taskId)) {
+     // This task has TaskAttempts from previous generation. We have to replay
+     // them.
      LOG.info("Task is from previous run " + taskId);
-     startCount = startCount - 1;
+     TaskInfo taskInfo = completedTasksFromPreviousRun.get(taskId);
+     Map<TaskAttemptID, TaskAttemptInfo> allAttempts =
+         taskInfo.getAllTaskAttempts();
+     taskAttemptsFromPreviousGeneration = new ArrayList<TaskAttemptInfo>();
+     taskAttemptsFromPreviousGeneration.addAll(allAttempts.values());
+     Collections.sort(taskAttemptsFromPreviousGeneration,
+         RECOVERED_ATTEMPTS_COMPARATOR);
    }
-   //attempt ids are generated based on MR app startCount so that attempts
-   //from previous lives don't overstep the current one.
-   //this assumes that a task won't have more than 1000 attempts in its single
-   //life
-   nextAttemptNumber = (startCount - 1) * 1000;
+   if (taskAttemptsFromPreviousGeneration.isEmpty()) {
+     // All the previous attempts are exhausted, now start with a new
+     // generation.
+     // All the new TaskAttemptIDs are generated based on MR
+     // ApplicationAttemptID so that attempts from previous lives don't
+     // over-step the current one. This assumes that a task won't have more
+     // than 1000 attempts in its single generation, which is very reasonable.
+     // Someone is nuts if he/she thinks he/she can live with 1000 TaskAttempts
+     // and requires serious medical attention.
+     nextAttemptNumber = (startCount - 1) * 1000;
+   } else {
+     // There are still some TaskAttempts from previous generation, use them
+     nextAttemptNumber =
+         taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId();
+   }
    // This "this leak" is okay because the retained pointer is in an
    // instance variable.
@@ -390,17 +427,23 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
  //this is always called in read/write lock
  private long getLaunchTime() {
-   long launchTime = 0;
+   long taskLaunchTime = 0;
+   boolean launchTimeSet = false;
    for (TaskAttempt at : attempts.values()) {
-     //select the least launch time of all attempts
-     if (launchTime == 0 || launchTime > at.getLaunchTime()) {
-       launchTime = at.getLaunchTime();
+     // select the least launch time of all attempts
+     long attemptLaunchTime = at.getLaunchTime();
+     if (attemptLaunchTime != 0 && !launchTimeSet) {
+       // For the first non-zero launch time
+       launchTimeSet = true;
+       taskLaunchTime = attemptLaunchTime;
+     } else if (attemptLaunchTime != 0 && taskLaunchTime > attemptLaunchTime) {
+       taskLaunchTime = attemptLaunchTime;
      }
    }
-   if (launchTime == 0) {
+   if (!launchTimeSet) {
      return this.scheduledTime;
    }
-   return launchTime;
+   return taskLaunchTime;
  }
  //this is always called in read/write lock
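The rewritten getLaunchTime() treats a zero launch time as "not launched yet" instead of letting it win the minimum, and falls back to the scheduled time only when no attempt has a real launch time. The same rule restated as a self-contained helper, for illustration only:

public class EarliestLaunchTime {
  static long launchTime(long[] attemptLaunchTimes, long scheduledTime) {
    long earliest = 0;
    boolean set = false;
    for (long t : attemptLaunchTimes) {
      if (t != 0 && (!set || t < earliest)) {
        earliest = t;   // earliest non-zero launch time seen so far
        set = true;
      }
    }
    return set ? earliest : scheduledTime;
  }

  public static void main(String[] args) {
    System.out.println(launchTime(new long[] {0, 120, 100}, 50)); // 100
    System.out.println(launchTime(new long[] {0, 0}, 50));        // 50
  }
}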
@@ -525,7 +568,16 @@ public abstract class TaskImpl implements Task, EventHandler<TaskEvent> {
      attempts.put(attempt.getID(), attempt);
      break;
    }
-   ++nextAttemptNumber;
+   // Update nextAttemptNumber
+   if (taskAttemptsFromPreviousGeneration.isEmpty()) {
+     ++nextAttemptNumber;
+   } else {
+     // There are still some TaskAttempts from previous generation, use them
+     nextAttemptNumber =
+         taskAttemptsFromPreviousGeneration.remove(0).getAttemptId().getId();
+   }
    ++numberUncompletedAttempts;
    //schedule the nextAttemptNumber
    if (failedAttempts > 0) {
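Taken together with the constructor change above, attempt numbering now works like this: attempts recovered from the previous AM generation, sorted by start time via RECOVERED_ATTEMPTS_COMPARATOR, are replayed with their original ids, and only a task with nothing to replay starts fresh at (startCount - 1) * 1000 so its ids cannot collide with earlier generations. A toy model of that numbering; the names here are mine, not the MR classes:

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;

public class AttemptNumbering {
  private final Deque<Integer> recoveredAttemptIds;  // pre-sorted by start time
  private int nextAttemptNumber;

  AttemptNumbering(Deque<Integer> recoveredAttemptIds, int startCount) {
    this.recoveredAttemptIds = recoveredAttemptIds;
    this.nextAttemptNumber = recoveredAttemptIds.isEmpty()
        ? (startCount - 1) * 1000            // fresh generation
        : recoveredAttemptIds.pop();         // replay the first recovered id
  }

  int current() { return nextAttemptNumber; }

  void advance() {
    if (recoveredAttemptIds.isEmpty()) {
      ++nextAttemptNumber;                   // continue counting up
    } else {
      nextAttemptNumber = recoveredAttemptIds.pop();  // replay next recovered id
    }
  }

  public static void main(String[] args) {
    AttemptNumbering withRecovery =
        new AttemptNumbering(new ArrayDeque<>(Arrays.asList(0, 1)), 2);
    System.out.println(withRecovery.current()); // 0 (replayed)
    withRecovery.advance();
    System.out.println(withRecovery.current()); // 1 (replayed)
    withRecovery.advance();
    System.out.println(withRecovery.current()); // 2 (continues past recovery)

    AttemptNumbering fresh = new AttemptNumbering(new ArrayDeque<>(), 2);
    System.out.println(fresh.current());        // 1000 (no recovery, generation 2)
  }
}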

View File

@@ -19,8 +19,9 @@
package org.apache.hadoop.mapreduce.v2.app.recover;
import java.util.List;
-import java.util.Set;
+import java.util.Map;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.yarn.Clock;
@@ -32,7 +33,7 @@ public interface Recovery {
  Clock getClock();
- Set<TaskId> getCompletedTasks();
+ Map<TaskId, TaskInfo> getCompletedTasks();
  List<AMInfo> getAMInfos();
}

View File

@@ -24,7 +24,6 @@ import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -35,6 +34,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.MRJobConfig;
 import org.apache.hadoop.mapreduce.OutputCommitter;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.hadoop.mapreduce.TaskType;
 import org.apache.hadoop.mapreduce.TypeConverter;
 import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser;
@@ -153,8 +153,8 @@ public class RecoveryService extends CompositeService implements Recovery {
   }

   @Override
-  public Set<TaskId> getCompletedTasks() {
-    return completedTasks.keySet();
+  public Map<TaskId, TaskInfo> getCompletedTasks() {
+    return completedTasks;
   }

   @Override
@@ -189,7 +189,8 @@ public class RecoveryService extends CompositeService implements Recovery {
         getConfig());
     //read the previous history file
     historyFile = fc.makeQualified(JobHistoryUtils.getStagingJobHistoryFile(
         histDirPath, jobName, (applicationAttemptId.getAttemptId() - 1)));
+    LOG.info("History file is at " + historyFile);
     in = fc.open(historyFile);
     JobHistoryParser parser = new JobHistoryParser(in);
     jobInfo = parser.parse();
@@ -242,7 +243,7 @@ public class RecoveryService extends CompositeService implements Recovery {
       if (event.getType() == TaskAttemptEventType.TA_CONTAINER_LAUNCHED) {
         TaskAttemptInfo attInfo = getTaskAttemptInfo(((TaskAttemptEvent) event)
             .getTaskAttemptID());
-        LOG.info("Attempt start time " + attInfo.getStartTime());
+        LOG.info("Recovered Attempt start time " + attInfo.getStartTime());
         clock.setTime(attInfo.getStartTime());
       } else if (event.getType() == TaskAttemptEventType.TA_DONE
@@ -250,7 +251,7 @@ public class RecoveryService extends CompositeService implements Recovery {
           || event.getType() == TaskAttemptEventType.TA_KILL) {
         TaskAttemptInfo attInfo = getTaskAttemptInfo(((TaskAttemptEvent) event)
             .getTaskAttemptID());
-        LOG.info("Attempt finish time " + attInfo.getFinishTime());
+        LOG.info("Recovered Attempt finish time " + attInfo.getFinishTime());
         clock.setTime(attInfo.getFinishTime());
       }
@@ -380,17 +381,17 @@ public class RecoveryService extends CompositeService implements Recovery {
         }
         // send the done event
-        LOG.info("Sending done event to " + aId);
+        LOG.info("Sending done event to recovered attempt " + aId);
         actualHandler.handle(new TaskAttemptEvent(aId,
             TaskAttemptEventType.TA_DONE));
         break;
       case KILLED:
-        LOG.info("Sending kill event to " + aId);
+        LOG.info("Sending kill event to recovered attempt " + aId);
         actualHandler.handle(new TaskAttemptEvent(aId,
             TaskAttemptEventType.TA_KILL));
         break;
       default:
-        LOG.info("Sending fail event to " + aId);
+        LOG.info("Sending fail event to recovered attempt " + aId);
         actualHandler.handle(new TaskAttemptEvent(aId,
             TaskAttemptEventType.TA_FAILMSG));
         break;

@@ -44,6 +44,7 @@ import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
 import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
 import org.apache.hadoop.mapreduce.v2.api.records.JobState;
+import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
 import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState;
 import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
 import org.apache.hadoop.mapreduce.v2.app.job.Job;
@@ -52,6 +53,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
 import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
 import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
 import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncher;
+import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherEvent;
 import org.apache.hadoop.mapreduce.v2.app.recover.Recovery;
 import org.apache.hadoop.mapreduce.v2.app.recover.RecoveryService;
 import org.apache.hadoop.util.ReflectionUtils;
@@ -74,7 +76,14 @@ public class TestRecovery {
   private Text val1 = new Text("val1");
   private Text val2 = new Text("val2");

+  /**
+   * AM with 2 maps and 1 reduce. For 1st map, one attempt fails, one attempt
+   * completely disappears because of failed launch, one attempt gets killed and
+   * one attempt succeeds. AM crashes after the first tasks finishes and
+   * recovers completely and succeeds in the second generation.
+   *
+   * @throws Exception
+   */
   @Test
   public void testCrashed() throws Exception {
@@ -112,7 +121,8 @@ public class TestRecovery {
     // reduces must be in NEW state
     Assert.assertEquals("Reduce Task state not correct",
         TaskState.RUNNING, reduceTask.getReport().getTaskState());

+    /////////// Play some games with the TaskAttempts of the first task //////
     //send the fail signal to the 1st map task attempt
     app.getContext().getEventHandler().handle(
         new TaskAttemptEvent(
@@ -120,29 +130,31 @@ public class TestRecovery {
           TaskAttemptEventType.TA_FAILMSG));
     app.waitForState(task1Attempt1, TaskAttemptState.FAILED);

-    while (mapTask1.getAttempts().size() != 2) {
+    int timeOut = 0;
+    while (mapTask1.getAttempts().size() != 2 && timeOut++ < 10) {
       Thread.sleep(2000);
       LOG.info("Waiting for next attempt to start");
     }
+    Assert.assertEquals(2, mapTask1.getAttempts().size());
     Iterator<TaskAttempt> itr = mapTask1.getAttempts().values().iterator();
     itr.next();
     TaskAttempt task1Attempt2 = itr.next();

-    app.waitForState(task1Attempt2, TaskAttemptState.RUNNING);
-    //send the kill signal to the 1st map 2nd attempt
+    // This attempt will automatically fail because of the way ContainerLauncher
+    // is setup
+    // This attempt 'disappears' from JobHistory and so causes MAPREDUCE-3846
     app.getContext().getEventHandler().handle(
-        new TaskAttemptEvent(
-            task1Attempt2.getID(),
-            TaskAttemptEventType.TA_KILL));
-    app.waitForState(task1Attempt2, TaskAttemptState.KILLED);
+        new TaskAttemptEvent(task1Attempt2.getID(),
+            TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED));
+    app.waitForState(task1Attempt2, TaskAttemptState.FAILED);

-    while (mapTask1.getAttempts().size() != 3) {
+    timeOut = 0;
+    while (mapTask1.getAttempts().size() != 3 && timeOut++ < 10) {
       Thread.sleep(2000);
       LOG.info("Waiting for next attempt to start");
     }
+    Assert.assertEquals(3, mapTask1.getAttempts().size());
     itr = mapTask1.getAttempts().values().iterator();
     itr.next();
     itr.next();
@@ -150,12 +162,36 @@ public class TestRecovery {
     app.waitForState(task1Attempt3, TaskAttemptState.RUNNING);

-    //send the done signal to the 1st map 3rd attempt
+    //send the kill signal to the 1st map 3rd attempt
     app.getContext().getEventHandler().handle(
         new TaskAttemptEvent(
             task1Attempt3.getID(),
+            TaskAttemptEventType.TA_KILL));
+    app.waitForState(task1Attempt3, TaskAttemptState.KILLED);
+
+    timeOut = 0;
+    while (mapTask1.getAttempts().size() != 4 && timeOut++ < 10) {
+      Thread.sleep(2000);
+      LOG.info("Waiting for next attempt to start");
+    }
+    Assert.assertEquals(4, mapTask1.getAttempts().size());
+
+    itr = mapTask1.getAttempts().values().iterator();
+    itr.next();
+    itr.next();
+    itr.next();
+    TaskAttempt task1Attempt4 = itr.next();
+
+    app.waitForState(task1Attempt4, TaskAttemptState.RUNNING);
+
+    //send the done signal to the 1st map 4th attempt
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            task1Attempt4.getID(),
             TaskAttemptEventType.TA_DONE));
+    /////////// End of games with the TaskAttempts of the first task //////

     //wait for first map task to complete
     app.waitForState(mapTask1, TaskState.SUCCEEDED);
     long task1StartTime = mapTask1.getReport().getStartTime();
@@ -241,6 +277,136 @@ public class TestRecovery {
     // available in the failed attempt should be available here
   }

+  @Test
+  public void testMultipleCrashes() throws Exception {
+
+    int runCount = 0;
+    MRApp app =
+        new MRAppWithHistory(2, 1, false, this.getClass().getName(), true,
+            ++runCount);
+    Configuration conf = new Configuration();
+    conf.setBoolean("mapred.mapper.new-api", true);
+    conf.setBoolean("mapred.reducer.new-api", true);
+    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
+    conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
+    Job job = app.submit(conf);
+    app.waitForState(job, JobState.RUNNING);
+    //all maps would be running
+    Assert.assertEquals("No of tasks not correct",
+        3, job.getTasks().size());
+    Iterator<Task> it = job.getTasks().values().iterator();
+    Task mapTask1 = it.next();
+    Task mapTask2 = it.next();
+    Task reduceTask = it.next();
+
+    // all maps must be running
+    app.waitForState(mapTask1, TaskState.RUNNING);
+    app.waitForState(mapTask2, TaskState.RUNNING);
+
+    TaskAttempt task1Attempt1 = mapTask1.getAttempts().values().iterator().next();
+    TaskAttempt task2Attempt = mapTask2.getAttempts().values().iterator().next();
+
+    //before sending the TA_DONE, event make sure attempt has come to
+    //RUNNING state
+    app.waitForState(task1Attempt1, TaskAttemptState.RUNNING);
+    app.waitForState(task2Attempt, TaskAttemptState.RUNNING);
+
+    // reduces must be in NEW state
+    Assert.assertEquals("Reduce Task state not correct",
+        TaskState.RUNNING, reduceTask.getReport().getTaskState());
+
+    //send the done signal to the 1st map
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            task1Attempt1.getID(),
+            TaskAttemptEventType.TA_DONE));
+
+    //wait for first map task to complete
+    app.waitForState(mapTask1, TaskState.SUCCEEDED);
+
+    // Crash the app
+    app.stop();
+
+    //rerun
+    //in rerun the 1st map will be recovered from previous run
+    app =
+        new MRAppWithHistory(2, 1, false, this.getClass().getName(), false,
+            ++runCount);
+    conf = new Configuration();
+    conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
+    conf.setBoolean("mapred.mapper.new-api", true);
+    conf.setBoolean("mapred.reducer.new-api", true);
+    conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
+    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
+    job = app.submit(conf);
+    app.waitForState(job, JobState.RUNNING);
+    //all maps would be running
+    Assert.assertEquals("No of tasks not correct",
+        3, job.getTasks().size());
+    it = job.getTasks().values().iterator();
+    mapTask1 = it.next();
+    mapTask2 = it.next();
+    reduceTask = it.next();
+
+    // first map will be recovered, no need to send done
+    app.waitForState(mapTask1, TaskState.SUCCEEDED);
+
+    app.waitForState(mapTask2, TaskState.RUNNING);
+
+    task2Attempt = mapTask2.getAttempts().values().iterator().next();
+    //before sending the TA_DONE, event make sure attempt has come to
+    //RUNNING state
+    app.waitForState(task2Attempt, TaskAttemptState.RUNNING);
+
+    //send the done signal to the 2nd map task
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            mapTask2.getAttempts().values().iterator().next().getID(),
+            TaskAttemptEventType.TA_DONE));
+
+    //wait to get it completed
+    app.waitForState(mapTask2, TaskState.SUCCEEDED);
+
+    // Crash the app again.
+    app.stop();
+
+    //rerun
+    //in rerun the 1st and 2nd map will be recovered from previous run
+    app =
+        new MRAppWithHistory(2, 1, false, this.getClass().getName(), false,
+            ++runCount);
+    conf = new Configuration();
+    conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
+    conf.setBoolean("mapred.mapper.new-api", true);
+    conf.setBoolean("mapred.reducer.new-api", true);
+    conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
+    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
+    job = app.submit(conf);
+    app.waitForState(job, JobState.RUNNING);
+    //all maps would be running
+    Assert.assertEquals("No of tasks not correct",
+        3, job.getTasks().size());
+    it = job.getTasks().values().iterator();
+    mapTask1 = it.next();
+    mapTask2 = it.next();
+    reduceTask = it.next();
+
+    // The maps will be recovered, no need to send done
+    app.waitForState(mapTask1, TaskState.SUCCEEDED);
+    app.waitForState(mapTask2, TaskState.SUCCEEDED);
+
+    //wait for reduce to be running before sending done
+    app.waitForState(reduceTask, TaskState.RUNNING);
+
+    //send the done signal to the reduce
+    app.getContext().getEventHandler().handle(
+        new TaskAttemptEvent(
+            reduceTask.getAttempts().values().iterator().next().getID(),
+            TaskAttemptEventType.TA_DONE));
+
+    app.waitForState(job, JobState.SUCCEEDED);
+    app.verifyCompleted();
+  }
+
   @Test
   public void testOutputRecovery() throws Exception {
     int runCount = 0;
@@ -552,7 +718,7 @@ public class TestRecovery {
     }
   }

-  class MRAppWithHistory extends MRApp {
+  static class MRAppWithHistory extends MRApp {
     public MRAppWithHistory(int maps, int reduces, boolean autoComplete,
         String testName, boolean cleanOnStart, int startCount) {
       super(maps, reduces, autoComplete, testName, cleanOnStart, startCount);
@@ -567,7 +733,17 @@ public class TestRecovery {
     @Override
     protected ContainerLauncher createContainerLauncher(AppContext context) {
-      MockContainerLauncher launcher = new MockContainerLauncher();
+      MockContainerLauncher launcher = new MockContainerLauncher() {
+        @Override
+        public void handle(ContainerLauncherEvent event) {
+          TaskAttemptId taskAttemptID = event.getTaskAttemptID();
+          // Pass everything except the 2nd attempt of the first task.
+          if (taskAttemptID.getId() != 1
+              || taskAttemptID.getTaskId().getId() != 0) {
+            super.handle(event);
+          }
+        }
+      };
       launcher.shufflePort = 5467;
       return launcher;
     }
@@ -581,7 +757,7 @@ public class TestRecovery {
     }
   }

-  class RecoveryServiceWithCustomDispatcher extends RecoveryService {
+  static class RecoveryServiceWithCustomDispatcher extends RecoveryService {
     public RecoveryServiceWithCustomDispatcher(
         ApplicationAttemptId applicationAttemptId, Clock clock,

@@ -25,7 +25,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
-import java.util.Set;
+import java.util.Map;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -34,6 +34,7 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Task;
 import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
 import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
 import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier;
 import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;
 import org.apache.hadoop.mapreduce.v2.api.records.JobId;
@@ -72,7 +73,7 @@ public class TestTaskImpl {
   private Path remoteJobConfFile;
   private Collection<Token<? extends TokenIdentifier>> fsTokens;
   private Clock clock;
-  private Set<TaskId> completedTasksFromPreviousRun;
+  private Map<TaskId, TaskInfo> completedTasksFromPreviousRun;
   private MRAppMetrics metrics;
   private TaskImpl mockTask;
   private ApplicationId appId;
@@ -96,7 +97,7 @@ public class TestTaskImpl {
         TaskAttemptListener taskAttemptListener, OutputCommitter committer,
         Token<JobTokenIdentifier> jobToken,
         Collection<Token<? extends TokenIdentifier>> fsTokens, Clock clock,
-        Set<TaskId> completedTasksFromPreviousRun, int startCount,
+        Map<TaskId, TaskInfo> completedTasksFromPreviousRun, int startCount,
         MRAppMetrics metrics) {
       super(jobId, taskType , partition, eventHandler,
           remoteJobConfFile, conf, taskAttemptListener, committer,

@@ -52,7 +52,6 @@ import org.apache.hadoop.yarn.api.records.YarnApplicationState;
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;

-@SuppressWarnings("deprecation")
 public class TypeConverter {

   private static RecordFactory recordFactory;
@@ -116,8 +115,8 @@ public class TypeConverter {
   }

   public static org.apache.hadoop.mapred.TaskID fromYarn(TaskId id) {
-    return new org.apache.hadoop.mapred.TaskID(fromYarn(id.getJobId()), fromYarn(id.getTaskType()),
-        id.getId());
+    return new org.apache.hadoop.mapred.TaskID(fromYarn(id.getJobId()),
+        fromYarn(id.getTaskType()), id.getId());
   }

   public static TaskId toYarn(org.apache.hadoop.mapreduce.TaskID id) {

@@ -148,7 +148,12 @@ public class FifoScheduler implements ResourceScheduler {
     QueueInfo queueInfo = recordFactory.newRecordInstance(QueueInfo.class);
     queueInfo.setQueueName(DEFAULT_QUEUE.getQueueName());
     queueInfo.setCapacity(1.0f);
-    queueInfo.setCurrentCapacity((float)usedResource.getMemory() / clusterResource.getMemory());
+    if (clusterResource.getMemory() == 0) {
+      queueInfo.setCurrentCapacity(0.0f);
+    } else {
+      queueInfo.setCurrentCapacity((float) usedResource.getMemory()
+          / clusterResource.getMemory());
+    }
     queueInfo.setMaximumCapacity(1.0f);
     queueInfo.setChildQueues(new ArrayList<QueueInfo>());
     queueInfo.setQueueState(QueueState.RUNNING);

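For context on the FifoScheduler hunk above: guarding the division keeps getQueueInfo() from reporting NaN for current capacity before any NodeManager has registered (cluster memory is 0). The same rule in isolation, with illustrative names rather than the scheduler's API:

  // Illustrative only: an empty cluster reports 0.0f instead of NaN.
  static float currentCapacity(int usedMemory, int clusterMemory) {
    return clusterMemory == 0 ? 0.0f : (float) usedMemory / clusterMemory;
  }
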
@@ -29,6 +29,7 @@ import org.apache.hadoop.net.NetworkTopology;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.Priority;
+import org.apache.hadoop.yarn.api.records.QueueInfo;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.event.AsyncDispatcher;
 import org.apache.hadoop.yarn.server.resourcemanager.Application;
@@ -74,6 +75,13 @@ public class TestFifoScheduler {
         .getRMContext());
   }

+  @Test
+  public void testFifoSchedulerCapacityWhenNoNMs() {
+    FifoScheduler scheduler = new FifoScheduler();
+    QueueInfo queueInfo = scheduler.getQueueInfo(null, false, false);
+    Assert.assertEquals(0.0f, queueInfo.getCurrentCapacity());
+  }
+
   @Test
   public void testAppAttemptMetrics() throws Exception {
     AsyncDispatcher dispatcher = new InlineDispatcher();

@@ -539,7 +539,7 @@ public class TestRMWebServices extends JerseyTest {
     assertEquals("type doesn't match", "fifoScheduler", type);
     assertEquals("qstate doesn't match", QueueState.RUNNING.toString(), state);
     assertEquals("capacity doesn't match", 1.0, capacity, 0.0);
-    assertEquals("usedCapacity doesn't match", Float.NaN, usedCapacity, 0.0);
+    assertEquals("usedCapacity doesn't match", 0.0, usedCapacity, 0.0);
     assertEquals("minQueueMemoryCapacity doesn't match", 1024, minQueueCapacity);
     assertEquals("maxQueueMemoryCapacity doesn't match", 10240,
         maxQueueCapacity);

@@ -437,32 +437,32 @@ Hadoop MapReduce Next Generation - Cluster Setup
   Format a new distributed filesystem:

 ----
-$ $HADOOP_PREFIX_HOME/bin/hdfs namenode -format <cluster_name>
+$ $HADOOP_PREFIX/bin/hdfs namenode -format <cluster_name>
 ----

   Start the HDFS with the following command, run on the designated NameNode:

 ----
-$ $HADOOP_PREFIX_HOME/bin/hdfs start namenode --config $HADOOP_CONF_DIR
+$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode
 ----

   Run a script to start DataNodes on all slaves:

 ----
-$ $HADOOP_PREFIX_HOME/bin/hdfs start datanode --config $HADOOP_CONF_DIR
+$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode
 ----

   Start the YARN with the following command, run on the designated
   ResourceManager:

 ----
-$ $YARN_HOME/bin/yarn start resourcemanager --config $HADOOP_CONF_DIR
+$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager
 ----

   Run a script to start NodeManagers on all slaves:

 ----
-$ $YARN_HOME/bin/yarn start nodemanager --config $HADOOP_CONF_DIR
+$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager
 ----

   Start a standalone WebAppProxy server. If multiple servers
@@ -476,7 +476,7 @@ Hadoop MapReduce Next Generation - Cluster Setup
   designated server:

 ----
-$ $YARN_HOME/bin/mapred start historyserver --config $YARN_CONF_DIR
+$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver --config $HADOOP_CONF_DIR
 ----

 * Hadoop Shutdown
@@ -485,26 +485,26 @@ Hadoop MapReduce Next Generation - Cluster Setup
   NameNode:

 ----
-$ $HADOOP_PREFIX_HOME/bin/hdfs stop namenode --config $HADOOP_CONF_DIR
+$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop namenode
 ----

   Run a script to stop DataNodes on all slaves:

 ----
-$ $HADOOP_PREFIX_HOME/bin/hdfs stop datanode --config $HADOOP_CONF_DIR
+$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop datanode
 ----

   Stop the ResourceManager with the following command, run on the designated
   ResourceManager:

 ----
-$ $YARN_HOME/bin/yarn stop resourcemanager --config $HADOOP_CONF_DIR
+$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop resourcemanager
 ----

   Run a script to stop NodeManagers on all slaves:

 ----
-$ $YARN_HOME/bin/yarn stop nodemanager --config $HADOOP_CONF_DIR
+$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop nodemanager
 ----

   Stop the WebAppProxy server. If multiple servers are used with load
@@ -519,7 +519,7 @@ Hadoop MapReduce Next Generation - Cluster Setup
   designated server:

 ----
-$ $YARN_HOME/bin/mapred stop historyserver --config $YARN_CONF_DIR
+$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh stop historyserver --config $HADOOP_CONF_DIR
 ----
@@ -978,34 +978,34 @@ KVNO Timestamp         Principal
   Format a new distributed filesystem as <hdfs>:

 ----
-[hdfs]$ $HADOOP_PREFIX_HOME/bin/hdfs namenode -format <cluster_name>
+[hdfs]$ $HADOOP_PREFIX/bin/hdfs namenode -format <cluster_name>
 ----

   Start the HDFS with the following command, run on the designated NameNode
   as <hdfs>:

 ----
-[hdfs]$ $HADOOP_PREFIX_HOME/bin/hdfs start namenode --config $HADOOP_CONF_DIR
+[hdfs]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode
 ----

   Run a script to start DataNodes on all slaves as <root> with a special
   environment variable <<<HADOOP_SECURE_DN_USER>>> set to <hdfs>:

 ----
-[root]$ HADOOP_SECURE_DN_USER=hdfs $HADOOP_PREFIX_HOME/bin/hdfs start datanode --config $HADOOP_CONF_DIR
+[root]$ HADOOP_SECURE_DN_USER=hdfs $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode
 ----

   Start the YARN with the following command, run on the designated
   ResourceManager as <yarn>:

 ----
-[yarn]$ $YARN_HOME/bin/yarn start resourcemanager --config $HADOOP_CONF_DIR
+[yarn]$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start resourcemanager
 ----

   Run a script to start NodeManagers on all slaves as <yarn>:

 ----
-[yarn]$ $YARN_HOME/bin/yarn start nodemanager --config $HADOOP_CONF_DIR
+[yarn]$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR start nodemanager
 ----

   Start a standalone WebAppProxy server. Run on the WebAppProxy
@@ -1020,7 +1020,7 @@ KVNO Timestamp         Principal
   designated server as <mapred>:

 ----
-[mapred]$ $YARN_HOME/bin/mapred start historyserver --config $YARN_CONF_DIR
+[mapred]$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver --config $HADOOP_CONF_DIR
 ----

 * Hadoop Shutdown
@@ -1029,26 +1029,26 @@ KVNO Timestamp         Principal
   as <hdfs>:

 ----
-[hdfs]$ $HADOOP_PREFIX_HOME/bin/hdfs stop namenode --config $HADOOP_CONF_DIR
+[hdfs]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop namenode
 ----

   Run a script to stop DataNodes on all slaves as <root>:

 ----
-[root]$ $HADOOP_PREFIX_HOME/bin/hdfs stop datanode --config $HADOOP_CONF_DIR
+[root]$ $HADOOP_PREFIX/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs stop datanode
 ----

   Stop the ResourceManager with the following command, run on the designated
   ResourceManager as <yarn>:

 ----
-[yarn]$ $YARN_HOME/bin/yarn stop resourcemanager --config $HADOOP_CONF_DIR
+[yarn]$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop resourcemanager
 ----

   Run a script to stop NodeManagers on all slaves as <yarn>:

 ----
-[yarn]$ $YARN_HOME/bin/yarn stop nodemanager --config $HADOOP_CONF_DIR
+[yarn]$ $YARN_HOME/sbin/yarn-daemon.sh --config $HADOOP_CONF_DIR stop nodemanager
 ----

   Stop the WebAppProxy server. Run on the WebAppProxy server as
@@ -1063,7 +1063,7 @@ KVNO Timestamp         Principal
   designated server as <mapred>:

 ----
-[mapred]$ $YARN_HOME/bin/mapred stop historyserver --config $YARN_CONF_DIR
+[mapred]$ $HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh stop historyserver --config $HADOOP_CONF_DIR
 ----

 * {Web Interfaces}

@@ -1192,13 +1192,17 @@ public class JobTracker implements MRConstants, InterTrackerProtocol,
       try {
         Path jobInfoFile = getSystemFileForJob(jobId);
         FSDataInputStream in = fs.open(jobInfoFile);
-        JobInfo token = new JobInfo();
+        final JobInfo token = new JobInfo();
         token.readFields(in);
         in.close();
-        UserGroupInformation ugi =
+        final UserGroupInformation ugi =
             UserGroupInformation.createRemoteUser(token.getUser().toString());
-        submitJob(token.getJobID(), restartCount,
-            ugi, token.getJobSubmitDir().toString(), true, null);
+        ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
+          public JobStatus run() throws IOException ,InterruptedException{
+            return submitJob(token.getJobID(), restartCount,
+                ugi, token.getJobSubmitDir().toString(), true, null);
+          }});
+
         recovered++;
       } catch (Exception e) {
         LOG.warn("Could not recover job " + jobId, e);

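For context on the JobTracker hunk above: wrapping the resubmission in UserGroupInformation.doAs makes the recovered job run as its original owner rather than as the JobTracker's login user. A minimal sketch of that pattern; jobOwner and submitRecoveredJob() are placeholders, while doAs and PrivilegedExceptionAction are the real APIs:

  // Inside a method that may throw IOException/InterruptedException.
  UserGroupInformation ugi = UserGroupInformation.createRemoteUser(jobOwner);
  JobStatus status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
    public JobStatus run() throws IOException, InterruptedException {
      return submitRecoveredJob();  // executes with jobOwner as the caller
    }
  });
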
@@ -733,6 +733,25 @@
       </pluginManagement>

       <plugins>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-antrun-plugin</artifactId>
+          <executions>
+            <execution>
+              <id>create-testdirs</id>
+              <phase>validate</phase>
+              <goals>
+                <goal>run</goal>
+              </goals>
+              <configuration>
+                <target>
+                  <mkdir dir="${test.build.dir}"/>
+                  <mkdir dir="${test.build.data}"/>
+                </target>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
         <plugin>
           <groupId>org.apache.maven.plugins</groupId>
           <artifactId>maven-enforcer-plugin</artifactId>

@@ -55,6 +55,7 @@
     <menu name="HDFS" inherit="top">
       <item name="Federation" href="hadoop-yarn/hadoop-yarn-site/Federation.html"/>
      <item name="WebHDFS REST API" href="hadoop-yarn/hadoop-yarn-site/WebHDFS.html"/>
+      <item name="HttpFS Gateway" href="hadoop-hdfs-httpfs/index.html"/>
     </menu>
     <menu name="MapReduce" inherit="top">