MAPREDUCE-5875. Make Counter limits consistent across JobClient, MRAppMaster, and YarnChild. (Gera Shegalov via kasha)

This commit is contained in:
Karthik Kambatla 2014-10-11 22:48:47 -07:00
parent ac64ff77cf
commit e8a31f2e1c
8 changed files with 127 additions and 22 deletions

View File

@ -420,6 +420,9 @@ Release 2.6.0 - UNRELEASED
MAPREDUCE-6123. TestCombineFileInputFormat incorrectly starts 2
MiniDFSCluster instances. (cnauroth)
MAPREDUCE-5875. Make Counter limits consistent across JobClient,
MRAppMaster, and YarnChild. (Gera Shegalov via kasha)
Release 2.5.1 - 2014-09-05
INCOMPATIBLE CHANGES

View File

@ -54,6 +54,7 @@ import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.jobhistory.AMStartedEvent;
import org.apache.hadoop.mapreduce.jobhistory.EventReader;
import org.apache.hadoop.mapreduce.jobhistory.EventType;
@ -1107,6 +1108,8 @@ public class MRAppMaster extends CompositeService {
// finally set the job classloader
MRApps.setClassLoader(jobClassLoader, getConfig());
// set job classloader if configured
Limits.init(getConfig());
if (initFailed) {
JobEvent initFailedEvent = new JobEvent(job.getID(), JobEventType.JOB_INIT_FAILED);

View File

@ -182,15 +182,15 @@ public class Cluster {
public Job getJob(JobID jobId) throws IOException, InterruptedException {
JobStatus status = client.getJobStatus(jobId);
if (status != null) {
JobConf conf;
final JobConf conf = new JobConf();
final Path jobPath = new Path(client.getFilesystemName(),
status.getJobFile());
final FileSystem fs = FileSystem.get(jobPath.toUri(), getConf());
try {
conf = new JobConf(status.getJobFile());
} catch (RuntimeException ex) {
// If job file doesn't exist it means we can't find the job
if (ex.getCause() instanceof FileNotFoundException) {
return null;
} else {
throw ex;
conf.addResource(fs.open(jobPath), jobPath.toString());
} catch (FileNotFoundException fnf) {
if (LOG.isWarnEnabled()) {
LOG.warn("Job conf missing on cluster", fnf);
}
}
return Job.getInstance(this, status, conf);

View File

@ -50,6 +50,7 @@ import org.apache.hadoop.mapred.QueueACL;
import static org.apache.hadoop.mapred.QueueManager.toFullPropertyName;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.protocol.ClientProtocol;
@ -437,6 +438,7 @@ class JobSubmitter {
// Write job file to submit dir
writeConf(conf, submitJobFile);
Limits.reset(conf);
//
// Now, actually submit the job (using the submit name)

View File

@ -123,4 +123,9 @@ public class Limits {
public synchronized LimitExceededException violation() {
return firstViolation;
}
public static synchronized void reset(Configuration conf) {
isInited = false;
init(conf);
}
}

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.mapreduce.jobhistory;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.Format;
@ -29,6 +30,8 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@ -41,6 +44,7 @@ import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.util.HostUtil;
@ -54,7 +58,8 @@ import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class HistoryViewer {
private static SimpleDateFormat dateFormat =
private static final Log LOG = LogFactory.getLog(HistoryViewer.class);
private static final SimpleDateFormat dateFormat =
new SimpleDateFormat("d-MMM-yyyy HH:mm:ss");
private FileSystem fs;
private JobInfo job;
@ -83,6 +88,17 @@ public class HistoryViewer {
System.err.println("Ignore unrecognized file: " + jobFile.getName());
throw new IOException(errorMsg);
}
final Path jobConfPath = new Path(jobFile.getParent(), jobDetails[0]
+ "_" + jobDetails[1] + "_" + jobDetails[2] + "_conf.xml");
final Configuration jobConf = new Configuration(conf);
try {
jobConf.addResource(fs.open(jobConfPath), jobConfPath.toString());
Limits.reset(conf);
} catch (FileNotFoundException fnf) {
if (LOG.isWarnEnabled()) {
LOG.warn("Missing job conf in history", fnf);
}
}
JobHistoryParser parser = new JobHistoryParser(fs, jobFile);
job = parser.parse();
jobId = job.getJobId().toString();

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.mapreduce.v2.hs;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
@ -34,6 +35,7 @@ import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobACLsManager;
import org.apache.hadoop.mapred.TaskCompletionEvent;
@ -41,6 +43,7 @@ import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.JobACL;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
@ -331,9 +334,21 @@ public class CompletedJob implements org.apache.hadoop.mapreduce.v2.app.job.Job
if (historyFileAbsolute != null) {
JobHistoryParser parser = null;
try {
final FileSystem fs = historyFileAbsolute.getFileSystem(conf);
parser =
new JobHistoryParser(historyFileAbsolute.getFileSystem(conf),
historyFileAbsolute);
final Path jobConfPath = new Path(historyFileAbsolute.getParent(),
JobHistoryUtils.getIntermediateConfFileName(jobId));
final Configuration conf = new Configuration();
try {
conf.addResource(fs.open(jobConfPath), jobConfPath.toString());
Limits.reset(conf);
} catch (FileNotFoundException fnf) {
if (LOG.isWarnEnabled()) {
LOG.warn("Missing job conf in history", fnf);
}
}
this.jobInfo = parser.parse();
} catch (IOException e) {
throw new YarnRuntimeException("Could not load history file "

View File

@ -53,10 +53,14 @@ import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TaskLog;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
@ -105,6 +109,7 @@ public class TestMRJobs {
EnumSet.of(RMAppState.FINISHED, RMAppState.FAILED, RMAppState.KILLED);
private static final int NUM_NODE_MGRS = 3;
private static final String TEST_IO_SORT_MB = "11";
private static final String TEST_GROUP_MAX = "200";
protected static MiniMRYarnCluster mrCluster;
protected static MiniDFSCluster dfsCluster;
@ -213,31 +218,58 @@ public class TestMRJobs {
}
@Test(timeout = 300000)
public void testJobClassloader() throws IOException, InterruptedException,
ClassNotFoundException {
testJobClassloader(false);
public void testConfVerificationWithClassloader() throws Exception {
testConfVerification(true, false, false, false);
}
@Test(timeout = 300000)
public void testJobClassloaderWithCustomClasses() throws IOException,
InterruptedException, ClassNotFoundException {
testJobClassloader(true);
public void testConfVerificationWithClassloaderCustomClasses()
throws Exception {
testConfVerification(true, true, false, false);
}
private void testJobClassloader(boolean useCustomClasses) throws IOException,
InterruptedException, ClassNotFoundException {
LOG.info("\n\n\nStarting testJobClassloader()"
+ " useCustomClasses=" + useCustomClasses);
@Test(timeout = 300000)
public void testConfVerificationWithOutClassloader() throws Exception {
testConfVerification(false, false, false, false);
}
@Test(timeout = 300000)
public void testConfVerificationWithJobClient() throws Exception {
testConfVerification(false, false, true, false);
}
@Test(timeout = 300000)
public void testConfVerificationWithJobClientLocal() throws Exception {
testConfVerification(false, false, true, true);
}
private void testConfVerification(boolean useJobClassLoader,
boolean useCustomClasses, boolean useJobClientForMonitring,
boolean useLocal) throws Exception {
LOG.info("\n\n\nStarting testConfVerification()"
+ " jobClassloader=" + useJobClassLoader
+ " customClasses=" + useCustomClasses
+ " jobClient=" + useJobClientForMonitring
+ " localMode=" + useLocal);
if (!(new File(MiniMRYarnCluster.APPJAR)).exists()) {
LOG.info("MRAppJar " + MiniMRYarnCluster.APPJAR
+ " not found. Not running test.");
return;
}
final Configuration sleepConf = new Configuration(mrCluster.getConfig());
final Configuration clusterConfig;
if (useLocal) {
clusterConfig = new Configuration();
conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.LOCAL_FRAMEWORK_NAME);
} else {
clusterConfig = mrCluster.getConfig();
}
final JobClient jc = new JobClient(clusterConfig);
final Configuration sleepConf = new Configuration(clusterConfig);
// set master address to local to test that local mode applied iff framework == local
sleepConf.set(MRConfig.MASTER_ADDRESS, "local");
sleepConf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, true);
sleepConf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER,
useJobClassLoader);
if (useCustomClasses) {
// to test AM loading user classes such as output format class, we want
// to blacklist them from the system classes (they need to be prepended
@ -255,6 +287,7 @@ public class TestMRJobs {
sleepConf.set(MRJobConfig.MAP_LOG_LEVEL, Level.ALL.toString());
sleepConf.set(MRJobConfig.REDUCE_LOG_LEVEL, Level.ALL.toString());
sleepConf.set(MRJobConfig.MAP_JAVA_OPTS, "-verbose:class");
sleepConf.set(MRJobConfig.COUNTER_GROUPS_MAX_KEY, TEST_GROUP_MAX);
final SleepJob sleepJob = new SleepJob();
sleepJob.setConf(sleepConf);
final Job job = sleepJob.createJob(1, 1, 10, 1, 10, 1);
@ -272,7 +305,26 @@ public class TestMRJobs {
jobConf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
}
job.submit();
boolean succeeded = job.waitForCompletion(true);
final boolean succeeded;
if (useJobClientForMonitring && !useLocal) {
// We can't use getJobID in useLocal case because JobClient and Job
// point to different instances of LocalJobRunner
//
final JobID mapredJobID = JobID.downgrade(job.getJobID());
RunningJob runningJob = null;
do {
Thread.sleep(10);
runningJob = jc.getJob(mapredJobID);
} while (runningJob == null);
Assert.assertEquals("Unexpected RunningJob's "
+ MRJobConfig.COUNTER_GROUPS_MAX_KEY,
TEST_GROUP_MAX, runningJob.getConfiguration()
.get(MRJobConfig.COUNTER_GROUPS_MAX_KEY));
runningJob.waitForCompletion();
succeeded = runningJob.isSuccessful();
} else {
succeeded = job.waitForCompletion(true);
}
Assert.assertTrue("Job status: " + job.getStatus().getFailureInfo(),
succeeded);
}
@ -925,5 +977,14 @@ public class TestMRJobs {
+ ", actual: " + ioSortMb);
}
}
@Override
public void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
super.map(key, value, context);
for (int i = 0; i < 100; i++) {
context.getCounter("testCounterGroup-" + i,
"testCounter").increment(1);
}
}
}
}