MAPREDUCE-3512. Batching JobHistory flushing to DFS so that we don't flush for every event slowing down AM. Contributed by Siddarth Seth.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1230353 13f79535-47bb-0310-9956-ffa450edef68
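In short: rather than flushing the history stream to DFS after every event, the handler now counts buffered completion events and flushes only when the batch cap is reached, when the inbound event queue drains below a configurable threshold, when a per-batch timeout fires, or when the job itself completes. Four new MRJobConfig knobs (defined in the last hunk below) control these triggers.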
commit 74697f2317 (parent 0086014703)
CHANGES.txt
@@ -201,6 +201,9 @@ Release 0.23.1 - Unreleased
     MAPREDUCE-3618. Fixed TaskHeartbeatHandler to not hold a global lock for all
     task-updates. (Siddarth Seth via vinodkv)
 
+    MAPREDUCE-3512. Batching JobHistory flushing to DFS so that we don't flush
+    for every event slowing down AM. (Siddarth Seth via vinodkv)
+
   BUG FIXES
 
     MAPREDUCE-3221. Reenabled the previously ignored test in TestSubmitJob

JobHistoryEventHandler.java
@@ -20,9 +20,12 @@ package org.apache.hadoop.mapreduce.jobhistory;
 
 import java.io.IOException;
 import java.util.Collections;
+import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;

@@ -70,13 +73,20 @@ public class JobHistoryEventHandler extends AbstractService
   private FileSystem stagingDirFS; // log Dir FileSystem
   private FileSystem doneDirFS; // done Dir FileSystem
 
-  private Configuration conf;
-
   private Path stagingDirPath = null;
   private Path doneDirPrefixPath = null; // folder for completed jobs
 
-  private BlockingQueue<JobHistoryEvent> eventQueue =
+  private int maxUnflushedCompletionEvents;
+  private int postJobCompletionMultiplier;
+  private long flushTimeout;
+  private int minQueueSizeForBatchingFlushes; // TODO: Rename
+  private int numUnflushedCompletionEvents = 0;
+  private boolean isTimerActive;
+
+  protected BlockingQueue<JobHistoryEvent> eventQueue =
     new LinkedBlockingQueue<JobHistoryEvent>();
   protected Thread eventHandlingThread;
   private volatile boolean stopped;

@@ -103,8 +113,6 @@ public class JobHistoryEventHandler extends AbstractService
   @Override
   public void init(Configuration conf) {
-
-    this.conf = conf;
 
     String stagingDirStr = null;
     String doneDirStr = null;
     String userDoneDirStr = null;

@@ -184,6 +192,27 @@ public class JobHistoryEventHandler extends AbstractService
       throw new YarnException(e);
     }
 
+    // Maximum number of unflushed completion-events that can stay in the queue
+    // before flush kicks in.
+    maxUnflushedCompletionEvents =
+        conf.getInt(MRJobConfig.MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS,
+            MRJobConfig.DEFAULT_MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS);
+    // We want to cut down flushes after job completes so as to write quicker,
+    // so we increase maxUnflushedEvents post Job completion by using the
+    // following multiplier.
+    postJobCompletionMultiplier =
+        conf.getInt(
+            MRJobConfig.MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER,
+            MRJobConfig.DEFAULT_MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER);
+    // Max time until which flush doesn't take place.
+    flushTimeout =
+        conf.getLong(MRJobConfig.MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS,
+            MRJobConfig.DEFAULT_MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS);
+    minQueueSizeForBatchingFlushes =
+        conf.getInt(
+            MRJobConfig.MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD,
+            MRJobConfig.DEFAULT_MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD);
+
     super.init(conf);
   }
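These four knobs are ordinary Configuration entries, so they can be tuned per job. A short sketch of overriding the defaults, using the same setters the new tests below use; the values are illustrative only:

  Configuration conf = new Configuration();
  // Flush once 100 completion events are buffered (default 200).
  conf.setInt(MRJobConfig.MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, 100);
  // Buffer 10x more aggressively once the job has completed (default 30).
  conf.setInt(MRJobConfig.MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, 10);
  // Never let a buffered event wait longer than 15 seconds (default 30s).
  conf.setLong(MRJobConfig.MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS, 15 * 1000l);
  // Batch only while at least 50 events are waiting in the queue (default 50).
  conf.setInt(MRJobConfig.MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, 50);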

@@ -256,14 +285,28 @@ public class JobHistoryEventHandler extends AbstractService
     stopped = true;
     //do not interrupt while event handling is in progress
     synchronized(lock) {
+      if (eventHandlingThread != null)
         eventHandlingThread.interrupt();
     }
 
     try {
+      if (eventHandlingThread != null)
         eventHandlingThread.join();
     } catch (InterruptedException ie) {
       LOG.info("Interrupted Exception while stopping", ie);
     }
 
+    // Cancel all timers - so that they aren't invoked during or after
+    // the metaInfo object is wrapped up.
+    for (MetaInfo mi : fileMap.values()) {
+      try {
+        mi.shutDownTimer();
+      } catch (IOException e) {
+        LOG.info("Exception while cancelling delayed flush timer. "
+            + "Likely caused by a failed flush " + e.getMessage());
+      }
+    }
+
     //write all the events remaining in queue
     Iterator<JobHistoryEvent> it = eventQueue.iterator();
     while(it.hasNext()) {

@@ -284,6 +327,12 @@ public class JobHistoryEventHandler extends AbstractService
     super.stop();
   }
 
+  protected EventWriter createEventWriter(Path historyFilePath)
+      throws IOException {
+    FSDataOutputStream out = stagingDirFS.create(historyFilePath, true);
+    return new EventWriter(out);
+  }
+
   /**
    * Create an event writer for the Job represented by the jobID.
    * Writes out the job configuration to the log directory.
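Factoring stream creation into the protected createEventWriter() above is the seam the new test exploits: JHEvenHandlerForTest overrides it to hand back a Mockito mock, so flush behavior can be verified without touching a real file system.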

@@ -319,8 +368,7 @@ public class JobHistoryEventHandler extends AbstractService
         JobHistoryUtils.getStagingConfFile(stagingDirPath, jobId, startCount);
     if (writer == null) {
       try {
-        FSDataOutputStream out = stagingDirFS.create(historyFile, true);
-        writer = new EventWriter(out);
+        writer = createEventWriter(historyFile);
         LOG.info("Event Writer setup for JobId: " + jobId + ", File: "
             + historyFile);
       } catch (IOException ioe) {

@@ -371,12 +419,26 @@ public class JobHistoryEventHandler extends AbstractService
   @Override
   public void handle(JobHistoryEvent event) {
     try {
+      if (isJobCompletionEvent(event.getHistoryEvent())) {
+        // When the job is complete, flush slower but write faster.
+        maxUnflushedCompletionEvents =
+            maxUnflushedCompletionEvents * postJobCompletionMultiplier;
+      }
+
       eventQueue.put(event);
     } catch (InterruptedException e) {
       throw new YarnException(e);
     }
   }
 
+  private boolean isJobCompletionEvent(HistoryEvent historyEvent) {
+    if (EnumSet.of(EventType.JOB_FINISHED, EventType.JOB_FAILED,
+        EventType.JOB_KILLED).contains(historyEvent.getEventType())) {
+      return true;
+    }
+    return false;
+  }
+
   protected void handleEvent(JobHistoryEvent event) {
     synchronized (lock) {
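With the defaults introduced below (a cap of 200 unflushed events and a multiplier of 30), a JOB_FINISHED, JOB_FAILED or JOB_KILLED event raises the cap to 200 * 30 = 6000, so the batch-size trigger effectively stops firing while the remaining backlog drains; flushing is then left to the queue-size heuristic, the timeout, and the flush forced by the completion event itself.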

@@ -615,32 +677,81 @@ public class JobHistoryEventHandler extends AbstractService
     }
   }
 
+  private class FlushTimerTask extends TimerTask {
+    private MetaInfo metaInfo;
+    private IOException ioe = null;
+    private volatile boolean shouldRun = true;
+
+    FlushTimerTask(MetaInfo metaInfo) {
+      this.metaInfo = metaInfo;
+    }
+
+    @Override
+    public void run() {
+      synchronized (lock) {
+        try {
+          if (!metaInfo.isTimerShutDown() && shouldRun)
+            metaInfo.flush();
+        } catch (IOException e) {
+          ioe = e;
+        }
+      }
+    }
+
+    public IOException getException() {
+      return ioe;
+    }
+
+    public void stop() {
+      shouldRun = false;
+      this.cancel();
+    }
+  }
+
   private class MetaInfo {
     private Path historyFile;
     private Path confFile;
     private EventWriter writer;
     JobIndexInfo jobIndexInfo;
     JobSummary jobSummary;
+    Timer flushTimer;
+    FlushTimerTask flushTimerTask;
+    private boolean isTimerShutDown = false;
 
-    MetaInfo(Path historyFile, Path conf, EventWriter writer,
-        String user, String jobName, JobId jobId) {
+    MetaInfo(Path historyFile, Path conf, EventWriter writer, String user,
+        String jobName, JobId jobId) {
       this.historyFile = historyFile;
       this.confFile = conf;
       this.writer = writer;
-      this.jobIndexInfo = new JobIndexInfo(-1, -1, user, jobName, jobId, -1, -1,
-          null);
+      this.jobIndexInfo =
+          new JobIndexInfo(-1, -1, user, jobName, jobId, -1, -1, null);
       this.jobSummary = new JobSummary();
+      this.flushTimer = new Timer("FlushTimer", true);
     }
 
-    Path getHistoryFile() { return historyFile; }
+    Path getHistoryFile() {
+      return historyFile;
+    }
 
-    Path getConfFile() {return confFile; }
+    Path getConfFile() {
+      return confFile;
+    }
 
-    JobIndexInfo getJobIndexInfo() { return jobIndexInfo; }
+    JobIndexInfo getJobIndexInfo() {
+      return jobIndexInfo;
+    }
 
-    JobSummary getJobSummary() { return jobSummary; }
+    JobSummary getJobSummary() {
+      return jobSummary;
+    }
 
-    boolean isWriterActive() {return writer != null ; }
+    boolean isWriterActive() {
+      return writer != null;
+    }
+
+    boolean isTimerShutDown() {
+      return isTimerShutDown;
+    }
 
     void closeWriter() throws IOException {
       synchronized (lock) {

@@ -655,7 +766,67 @@ public class JobHistoryEventHandler extends AbstractService
       synchronized (lock) {
         if (writer != null) {
           writer.write(event);
+          processEventForFlush(event);
+          maybeFlush(event);
+        }
+      }
+    }
+
+    void processEventForFlush(HistoryEvent historyEvent) throws IOException {
+      if (EnumSet.of(EventType.MAP_ATTEMPT_FINISHED,
+          EventType.MAP_ATTEMPT_FAILED, EventType.MAP_ATTEMPT_KILLED,
+          EventType.REDUCE_ATTEMPT_FINISHED, EventType.REDUCE_ATTEMPT_FAILED,
+          EventType.REDUCE_ATTEMPT_KILLED, EventType.TASK_FINISHED,
+          EventType.TASK_FAILED, EventType.JOB_FINISHED, EventType.JOB_FAILED,
+          EventType.JOB_KILLED).contains(historyEvent.getEventType())) {
+        numUnflushedCompletionEvents++;
+        if (!isTimerActive) {
+          resetFlushTimer();
+          if (!isTimerShutDown) {
+            flushTimerTask = new FlushTimerTask(this);
+            flushTimer.schedule(flushTimerTask, flushTimeout);
+          }
+        }
+      }
+    }
+
+    void resetFlushTimer() throws IOException {
+      if (flushTimerTask != null) {
+        IOException exception = flushTimerTask.getException();
+        flushTimerTask.stop();
+        if (exception != null) {
+          throw exception;
+        }
+        flushTimerTask = null;
+      }
+      isTimerActive = false;
+    }
+
+    void maybeFlush(HistoryEvent historyEvent) throws IOException {
+      if ((eventQueue.size() < minQueueSizeForBatchingFlushes
+          && numUnflushedCompletionEvents > 0)
+          || numUnflushedCompletionEvents >= maxUnflushedCompletionEvents
+          || isJobCompletionEvent(historyEvent)) {
+        this.flush();
+      }
+    }
+
+    void flush() throws IOException {
+      synchronized (lock) {
+        if (numUnflushedCompletionEvents != 0) { // skipped timer cancel.
           writer.flush();
+          numUnflushedCompletionEvents = 0;
+          resetFlushTimer();
+        }
+      }
+    }
+
+    void shutDownTimer() throws IOException {
+      synchronized (lock) {
+        isTimerShutDown = true;
+        flushTimer.cancel();
+        if (flushTimerTask != null && flushTimerTask.getException() != null) {
+          throw flushTimerTask.getException();
+        }
       }
     }
   }
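Stripped of the Hadoop scaffolding, the policy implemented by processEventForFlush/maybeFlush/flush reduces to three triggers: the batch is full, the backlog has drained below the batching threshold, or the timer fires. A minimal, self-contained sketch of that pattern; class and method names here (BatchedFlusher, onEventWritten) are illustrative only, not part of this patch:

  import java.io.Flushable;
  import java.io.IOException;
  import java.util.Timer;
  import java.util.TimerTask;

  public class BatchedFlusher {
    private final Object lock = new Object();
    private final Flushable writer;
    private final Timer timer = new Timer("FlushTimer", true); // daemon, like the patch
    private final int maxUnflushed;          // trigger 1: batch-size cap
    private final int minBacklogForBatching; // trigger 2: stop batching when drained
    private final long flushTimeoutMs;       // trigger 3: upper bound on event latency
    private int unflushed = 0;
    private TimerTask pending = null;

    public BatchedFlusher(Flushable writer, int maxUnflushed,
        int minBacklogForBatching, long flushTimeoutMs) {
      this.writer = writer;
      this.maxUnflushed = maxUnflushed;
      this.minBacklogForBatching = minBacklogForBatching;
      this.flushTimeoutMs = flushTimeoutMs;
    }

    /** Called after each write; backlog is the number of events still queued. */
    public void onEventWritten(int backlog) throws IOException {
      synchronized (lock) {
        unflushed++;
        if (pending == null) {
          // Arm the timeout when the first event of a batch is buffered.
          pending = new TimerTask() {
            @Override
            public void run() {
              synchronized (lock) {
                try {
                  flush();
                } catch (IOException ignored) {
                  // A real implementation records this and rethrows later,
                  // as FlushTimerTask.getException() does in the patch.
                }
              }
            }
          };
          timer.schedule(pending, flushTimeoutMs);
        }
        if (unflushed >= maxUnflushed || backlog < minBacklogForBatching) {
          flush();
        }
      }
    }

    public void flush() throws IOException {
      synchronized (lock) {
        if (unflushed > 0) {
          writer.flush();
          unflushed = 0;
        }
        if (pending != null) {
          pending.cancel();
          pending = null;
        }
      }
    }
  }

The real handler layers two more things on top: an unconditional flush on job-completion events, and deferred reporting of IOExceptions thrown on the timer thread (surfaced via FlushTimerTask.getException() from resetFlushTimer() or shutDownTimer()).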

@@ -682,7 +853,7 @@ public class JobHistoryEventHandler extends AbstractService
       doneDirFS.delete(toPath, true);
     }
     boolean copied = FileUtil.copy(stagingDirFS, fromPath, doneDirFS, toPath,
-        false, conf);
+        false, getConfig());
 
     if (copied)
       LOG.info("Copied to done location: " + toPath);

TestJobHistoryEventHandler.java (new file)
@@ -0,0 +1,310 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.jobhistory;

import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.io.File;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.app.AppContext;
import org.apache.hadoop.mapreduce.v2.app.job.Job;
import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.util.BuilderUtils;
import org.junit.Test;

public class TestJobHistoryEventHandler {

  private static final Log LOG = LogFactory
      .getLog(TestJobHistoryEventHandler.class);

  @Test
  public void testFirstFlushOnCompletionEvent() throws Exception {
    TestParams t = new TestParams();
    Configuration conf = new Configuration();
    conf.set(MRJobConfig.MR_AM_STAGING_DIR, t.workDir);
    conf.setLong(MRJobConfig.MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS,
        60 * 1000l);
    conf.setInt(MRJobConfig.MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, 10);
    conf.setInt(MRJobConfig.MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, 10);
    conf.setInt(
        MRJobConfig.MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, 200);

    JHEvenHandlerForTest realJheh =
        new JHEvenHandlerForTest(t.mockAppContext, 0);
    JHEvenHandlerForTest jheh = spy(realJheh);
    jheh.init(conf);

    EventWriter mockWriter = null;
    try {
      jheh.start();
      handleEvent(jheh, new JobHistoryEvent(t.jobId, new AMStartedEvent(
          t.appAttemptId, 200, t.containerId, "nmhost", 3000, 4000)));
      mockWriter = jheh.getEventWriter();
      verify(mockWriter).write(any(HistoryEvent.class));

      for (int i = 0; i < 100; i++) {
        queueEvent(jheh, new JobHistoryEvent(t.jobId, new TaskStartedEvent(
            t.taskID, 0, TaskType.MAP, "")));
      }
      handleNextNEvents(jheh, 100);
      verify(mockWriter, times(0)).flush();

      // First completion event: the queue is drained, so it is below the
      // batching threshold and the flush happens immediately.
      handleEvent(jheh, new JobHistoryEvent(t.jobId, new TaskFinishedEvent(
          t.taskID, 0, TaskType.MAP, "", null)));
      verify(mockWriter).flush();
    } finally {
      jheh.stop();
      verify(mockWriter).close();
    }
  }

  @Test
  public void testMaxUnflushedCompletionEvents() throws Exception {
    TestParams t = new TestParams();
    Configuration conf = new Configuration();
    conf.set(MRJobConfig.MR_AM_STAGING_DIR, t.workDir);
    conf.setLong(MRJobConfig.MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS,
        60 * 1000l);
    conf.setInt(MRJobConfig.MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, 10);
    conf.setInt(MRJobConfig.MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, 10);
    conf.setInt(
        MRJobConfig.MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, 5);

    JHEvenHandlerForTest realJheh =
        new JHEvenHandlerForTest(t.mockAppContext, 0);
    JHEvenHandlerForTest jheh = spy(realJheh);
    jheh.init(conf);

    EventWriter mockWriter = null;
    try {
      jheh.start();
      handleEvent(jheh, new JobHistoryEvent(t.jobId, new AMStartedEvent(
          t.appAttemptId, 200, t.containerId, "nmhost", 3000, 4000)));
      mockWriter = jheh.getEventWriter();
      verify(mockWriter).write(any(HistoryEvent.class));

      for (int i = 0 ; i < 100 ; i++) {
        queueEvent(jheh, new JobHistoryEvent(t.jobId, new TaskFinishedEvent(
            t.taskID, 0, TaskType.MAP, "", null)));
      }

      handleNextNEvents(jheh, 9);
      verify(mockWriter, times(0)).flush();

      handleNextNEvents(jheh, 1);
      verify(mockWriter).flush();

      handleNextNEvents(jheh, 50);
      verify(mockWriter, times(6)).flush();
    } finally {
      jheh.stop();
      verify(mockWriter).close();
    }
  }

  @Test
  public void testUnflushedTimer() throws Exception {
    TestParams t = new TestParams();
    Configuration conf = new Configuration();
    conf.set(MRJobConfig.MR_AM_STAGING_DIR, t.workDir);
    conf.setLong(MRJobConfig.MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS,
        2 * 1000l); // 2 seconds.
    conf.setInt(MRJobConfig.MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, 10);
    conf.setInt(MRJobConfig.MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, 100);
    conf.setInt(
        MRJobConfig.MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, 5);

    JHEvenHandlerForTest realJheh =
        new JHEvenHandlerForTest(t.mockAppContext, 0);
    JHEvenHandlerForTest jheh = spy(realJheh);
    jheh.init(conf);

    EventWriter mockWriter = null;
    try {
      jheh.start();
      handleEvent(jheh, new JobHistoryEvent(t.jobId, new AMStartedEvent(
          t.appAttemptId, 200, t.containerId, "nmhost", 3000, 4000)));
      mockWriter = jheh.getEventWriter();
      verify(mockWriter).write(any(HistoryEvent.class));

      for (int i = 0 ; i < 100 ; i++) {
        queueEvent(jheh, new JobHistoryEvent(t.jobId, new TaskFinishedEvent(
            t.taskID, 0, TaskType.MAP, "", null)));
      }

      handleNextNEvents(jheh, 9);
      verify(mockWriter, times(0)).flush();

      Thread.sleep(2 * 4 * 1000l); // 8 seconds should be plenty. Just be safe.
      verify(mockWriter).flush();
    } finally {
      jheh.stop();
      verify(mockWriter).close();
    }
  }

  @Test
  public void testBatchedFlushJobEndMultiplier() throws Exception {
    TestParams t = new TestParams();
    Configuration conf = new Configuration();
    conf.set(MRJobConfig.MR_AM_STAGING_DIR, t.workDir);
    conf.setLong(MRJobConfig.MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS,
        60 * 1000l); // 60 seconds.
    conf.setInt(MRJobConfig.MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, 3);
    conf.setInt(MRJobConfig.MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, 10);
    conf.setInt(
        MRJobConfig.MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, 0);

    JHEvenHandlerForTest realJheh =
        new JHEvenHandlerForTest(t.mockAppContext, 0);
    JHEvenHandlerForTest jheh = spy(realJheh);
    jheh.init(conf);

    EventWriter mockWriter = null;
    try {
      jheh.start();
      handleEvent(jheh, new JobHistoryEvent(t.jobId, new AMStartedEvent(
          t.appAttemptId, 200, t.containerId, "nmhost", 3000, 4000)));
      mockWriter = jheh.getEventWriter();
      verify(mockWriter).write(any(HistoryEvent.class));

      for (int i = 0 ; i < 100 ; i++) {
        queueEvent(jheh, new JobHistoryEvent(t.jobId, new TaskFinishedEvent(
            t.taskID, 0, TaskType.MAP, "", null)));
      }
      queueEvent(jheh, new JobHistoryEvent(t.jobId, new JobFinishedEvent(
          TypeConverter.fromYarn(t.jobId), 0, 10, 10, 0, 0, null, null, new Counters())));

      handleNextNEvents(jheh, 29);
      verify(mockWriter, times(0)).flush();

      handleNextNEvents(jheh, 72);
      verify(mockWriter, times(4)).flush(); //3 * 30 + 1 for JobFinished
    } finally {
      jheh.stop();
      verify(mockWriter).close();
    }
  }

  private void queueEvent(JHEvenHandlerForTest jheh, JobHistoryEvent event) {
    jheh.handle(event);
  }

  private void handleEvent(JHEvenHandlerForTest jheh, JobHistoryEvent event)
      throws InterruptedException {
    jheh.handle(event);
    jheh.handleEvent(jheh.eventQueue.take());
  }

  private void handleNextNEvents(JHEvenHandlerForTest jheh, int numEvents)
      throws InterruptedException {
    for (int i = 0; i < numEvents; i++) {
      jheh.handleEvent(jheh.eventQueue.take());
    }
  }

  private String setupTestWorkDir() {
    File testWorkDir = new File("target", this.getClass().getCanonicalName());
    try {
      FileContext.getLocalFSFileContext().delete(
          new Path(testWorkDir.getAbsolutePath()), true);
      return testWorkDir.getAbsolutePath();
    } catch (Exception e) {
      LOG.warn("Could not cleanup", e);
      throw new YarnException("could not cleanup test dir", e);
    }
  }

  private AppContext mockAppContext(JobId jobId) {
    AppContext mockContext = mock(AppContext.class);
    Job mockJob = mock(Job.class);
    when(mockJob.getTotalMaps()).thenReturn(10);
    when(mockJob.getTotalReduces()).thenReturn(10);
    when(mockJob.getName()).thenReturn("mockjob");
    when(mockContext.getJob(jobId)).thenReturn(mockJob);
    return mockContext;
  }

  private class TestParams {
    String workDir = setupTestWorkDir();
    ApplicationId appId = BuilderUtils.newApplicationId(200, 1);
    ApplicationAttemptId appAttemptId =
        BuilderUtils.newApplicationAttemptId(appId, 1);
    ContainerId containerId = BuilderUtils.newContainerId(appAttemptId, 1);
    TaskID taskID = TaskID.forName("task_200707121733_0003_m_000005");
    JobId jobId = MRBuilderUtils.newJobId(appId, 1);
    AppContext mockAppContext = mockAppContext(jobId);
  }
}

class JHEvenHandlerForTest extends JobHistoryEventHandler {

  private EventWriter eventWriter;
  volatile int handleEventCompleteCalls = 0;
  volatile int handleEventStartedCalls = 0;

  public JHEvenHandlerForTest(AppContext context, int startCount) {
    super(context, startCount);
  }

  @Override
  public void start() {
  }

  @Override
  protected EventWriter createEventWriter(Path historyFilePath)
      throws IOException {
    this.eventWriter = mock(EventWriter.class);
    return this.eventWriter;
  }

  @Override
  protected void closeEventWriter(JobId jobId) {
  }

  public EventWriter getEventWriter() {
    return this.eventWriter;
  }
}
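The times(6) expectation in testMaxUnflushedCompletionEvents is worth checking by hand: with the cap at 10 and the queue threshold at 5, the backlog (100 queued events, drained one at a time) stays above 5 throughout, so only the batch-size trigger fires — at the 10th completion event handled and at every further multiple of 10. Sixty handled events therefore yield exactly 6 flushes.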

MRJobConfig.java
@@ -436,6 +436,26 @@ public interface MRJobConfig {
   public static final String MR_AM_CREATE_JH_INTERMEDIATE_BASE_DIR = 
       MR_AM_PREFIX + "create-intermediate-jh-base-dir";
 
+  public static final String MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS =
+      MR_AM_PREFIX + "history.max-unflushed-events";
+  public static final int DEFAULT_MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS =
+      200;
+
+  public static final String MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER =
+      MR_AM_PREFIX + "history.job-complete-unflushed-multiplier";
+  public static final int DEFAULT_MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER =
+      30;
+
+  public static final String MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS =
+      MR_AM_PREFIX + "history.complete-event-flush-timeout";
+  public static final long DEFAULT_MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS =
+      30 * 1000l;
+
+  public static final String MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD =
+      MR_AM_PREFIX + "history.use-batched-flush.queue-size.threshold";
+  public static final int DEFAULT_MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD =
+      50;
+
   public static final String MAPRED_MAP_ADMIN_JAVA_OPTS =
       "mapreduce.admin.map.child.java.opts";
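For setting these in mapred-site.xml rather than through the constants: assuming MR_AM_PREFIX still resolves to "yarn.app.mapreduce.am." (as elsewhere in MRJobConfig, but worth verifying against your tree), the full keys are yarn.app.mapreduce.am.history.max-unflushed-events, ...history.job-complete-unflushed-multiplier, ...history.complete-event-flush-timeout, and ...history.use-batched-flush.queue-size.threshold.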