MAPREDUCE-4680. Job history cleaner should only check timestamps of files in old enough directories (Robert Kanter via Sandy Ryza)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1536558 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sanford Ryza 2013-10-28 23:48:38 +00:00
parent a2d81e3017
commit 84cec3c805
5 changed files with 224 additions and 5 deletions

View File

@ -214,6 +214,9 @@ Release 2.2.1 - UNRELEASED
OPTIMIZATIONS
MAPREDUCE-4680. Job history cleaner should only check timestamps of files in
old enough directories (Robert Kanter via Sandy Ryza)
BUG FIXES
MAPREDUCE-5569. FloatSplitter is not generating correct splits (Nathan

View File

@ -21,6 +21,7 @@ package org.apache.hadoop.mapreduce.v2.jobhistory;
import java.io.File;
import java.io.IOException;
import java.util.Calendar;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
@ -499,4 +500,72 @@ public class JobHistoryUtils {
return fc.makeQualified(JobHistoryUtils.getStagingJobHistoryFile(
histDirPath,jobId, (applicationAttemptId.getAttemptId() - 1)));
}
/**
* Looks for the dirs to clean. The folder structure is YYYY/MM/DD/Serial so
* we can use that to more efficiently find the directories to clean by
* comparing the cutoff timestamp with the timestamp from the folder
* structure.
*
* @param fc done dir FileContext
* @param root folder for completed jobs
* @param cutoff The cutoff for the max history age
* @return The list of directories for cleaning
* @throws IOException
*/
public static List<FileStatus> getHistoryDirsForCleaning(FileContext fc,
Path root, long cutoff) throws IOException {
List<FileStatus> fsList = new ArrayList<FileStatus>();
Calendar cCal = Calendar.getInstance();
cCal.setTimeInMillis(cutoff);
int cYear = cCal.get(Calendar.YEAR);
int cMonth = cCal.get(Calendar.MONTH) + 1;
int cDate = cCal.get(Calendar.DATE);
RemoteIterator<FileStatus> yearDirIt = fc.listStatus(root);
while (yearDirIt.hasNext()) {
FileStatus yearDir = yearDirIt.next();
try {
int year = Integer.parseInt(yearDir.getPath().getName());
if (year <= cYear) {
RemoteIterator<FileStatus> monthDirIt =
fc.listStatus(yearDir.getPath());
while (monthDirIt.hasNext()) {
FileStatus monthDir = monthDirIt.next();
try {
int month = Integer.parseInt(monthDir.getPath().getName());
// If we only checked the month here, then something like 07/2013
// would incorrectly not pass when the cutoff is 06/2014
if (year < cYear || month <= cMonth) {
RemoteIterator<FileStatus> dateDirIt =
fc.listStatus(monthDir.getPath());
while (dateDirIt.hasNext()) {
FileStatus dateDir = dateDirIt.next();
try {
int date = Integer.parseInt(dateDir.getPath().getName());
// If we only checked the date here, then something like
// 07/21/2013 would incorrectly not pass when the cutoff is
// 08/20/2013 or 07/20/2012
if (year < cYear || month < cMonth || date <= cDate) {
fsList.addAll(remoteIterToList(
fc.listStatus(dateDir.getPath())));
}
} catch (NumberFormatException nfe) {
// the directory didn't fit the format we're looking for so
// skip the dir
}
}
}
} catch (NumberFormatException nfe) {
// the directory didn't fit the format we're looking for so skip
// the dir
}
}
}
} catch (NumberFormatException nfe) {
// the directory didn't fit the format we're looking for so skip the dir
}
}
return fsList;
}
}

View File

@ -0,0 +1,143 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce.v2.jobhistory;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.junit.Assert;
import org.junit.Test;
public class TestJobHistoryUtils {
final static String TEST_DIR = new File(System.getProperty("test.build.data"))
.getAbsolutePath();
@Test
@SuppressWarnings("unchecked")
public void testGetHistoryDirsForCleaning() throws IOException {
Path pRoot = new Path(TEST_DIR, "org.apache.hadoop.mapreduce.v2.jobhistory."
+ "TestJobHistoryUtils.testGetHistoryDirsForCleaning");
FileContext fc = FileContext.getFileContext();
Calendar cCal = Calendar.getInstance();
int year = 2013;
int month = 7;
int day = 21;
cCal.set(year, month - 1, day, 1, 0);
long cutoff = cCal.getTimeInMillis();
clearDir(fc, pRoot);
Path pId00 = createPath(fc, pRoot, year, month, day, "000000");
Path pId01 = createPath(fc, pRoot, year, month, day+1, "000001");
Path pId02 = createPath(fc, pRoot, year, month, day-1, "000002");
Path pId03 = createPath(fc, pRoot, year, month+1, day, "000003");
Path pId04 = createPath(fc, pRoot, year, month+1, day+1, "000004");
Path pId05 = createPath(fc, pRoot, year, month+1, day-1, "000005");
Path pId06 = createPath(fc, pRoot, year, month-1, day, "000006");
Path pId07 = createPath(fc, pRoot, year, month-1, day+1, "000007");
Path pId08 = createPath(fc, pRoot, year, month-1, day-1, "000008");
Path pId09 = createPath(fc, pRoot, year+1, month, day, "000009");
Path pId10 = createPath(fc, pRoot, year+1, month, day+1, "000010");
Path pId11 = createPath(fc, pRoot, year+1, month, day-1, "000011");
Path pId12 = createPath(fc, pRoot, year+1, month+1, day, "000012");
Path pId13 = createPath(fc, pRoot, year+1, month+1, day+1, "000013");
Path pId14 = createPath(fc, pRoot, year+1, month+1, day-1, "000014");
Path pId15 = createPath(fc, pRoot, year+1, month-1, day, "000015");
Path pId16 = createPath(fc, pRoot, year+1, month-1, day+1, "000016");
Path pId17 = createPath(fc, pRoot, year+1, month-1, day-1, "000017");
Path pId18 = createPath(fc, pRoot, year-1, month, day, "000018");
Path pId19 = createPath(fc, pRoot, year-1, month, day+1, "000019");
Path pId20 = createPath(fc, pRoot, year-1, month, day-1, "000020");
Path pId21 = createPath(fc, pRoot, year-1, month+1, day, "000021");
Path pId22 = createPath(fc, pRoot, year-1, month+1, day+1, "000022");
Path pId23 = createPath(fc, pRoot, year-1, month+1, day-1, "000023");
Path pId24 = createPath(fc, pRoot, year-1, month-1, day, "000024");
Path pId25 = createPath(fc, pRoot, year-1, month-1, day+1, "000025");
Path pId26 = createPath(fc, pRoot, year-1, month-1, day-1, "000026");
// non-expected names should be ignored without problems
Path pId27 = createPath(fc, pRoot, "foo", "" + month, "" + day, "000027");
Path pId28 = createPath(fc, pRoot, "" + year, "foo", "" + day, "000028");
Path pId29 = createPath(fc, pRoot, "" + year, "" + month, "foo", "000029");
List<FileStatus> dirs = JobHistoryUtils
.getHistoryDirsForCleaning(fc, pRoot, cutoff);
Collections.sort(dirs);
Assert.assertEquals(14, dirs.size());
Assert.assertEquals(pId26.toUri().getPath(),
dirs.get(0).getPath().toUri().getPath());
Assert.assertEquals(pId24.toUri().getPath(),
dirs.get(1).getPath().toUri().getPath());
Assert.assertEquals(pId25.toUri().getPath(),
dirs.get(2).getPath().toUri().getPath());
Assert.assertEquals(pId20.toUri().getPath(),
dirs.get(3).getPath().toUri().getPath());
Assert.assertEquals(pId18.toUri().getPath(),
dirs.get(4).getPath().toUri().getPath());
Assert.assertEquals(pId19.toUri().getPath(),
dirs.get(5).getPath().toUri().getPath());
Assert.assertEquals(pId23.toUri().getPath(),
dirs.get(6).getPath().toUri().getPath());
Assert.assertEquals(pId21.toUri().getPath(),
dirs.get(7).getPath().toUri().getPath());
Assert.assertEquals(pId22.toUri().getPath(),
dirs.get(8).getPath().toUri().getPath());
Assert.assertEquals(pId08.toUri().getPath(),
dirs.get(9).getPath().toUri().getPath());
Assert.assertEquals(pId06.toUri().getPath(),
dirs.get(10).getPath().toUri().getPath());
Assert.assertEquals(pId07.toUri().getPath(),
dirs.get(11).getPath().toUri().getPath());
Assert.assertEquals(pId02.toUri().getPath(),
dirs.get(12).getPath().toUri().getPath());
Assert.assertEquals(pId00.toUri().getPath(),
dirs.get(13).getPath().toUri().getPath());
}
private void clearDir(FileContext fc, Path p) throws IOException {
try {
fc.delete(p, true);
} catch (FileNotFoundException e) {
// ignore
}
fc.mkdir(p, FsPermission.getDirDefault(), false);
}
private Path createPath(FileContext fc, Path root, int year, int month,
int day, String id) throws IOException {
Path path = new Path(root, year + Path.SEPARATOR + month + Path.SEPARATOR +
day + Path.SEPARATOR + id);
fc.mkdir(path, FsPermission.getDirDefault(), true);
return path;
}
private Path createPath(FileContext fc, Path root, String year, String month,
String day, String id) throws IOException {
Path path = new Path(root, year + Path.SEPARATOR + month + Path.SEPARATOR +
day + Path.SEPARATOR + id);
fc.mkdir(path, FsPermission.getDirDefault(), true);
return path;
}
}

View File

@ -924,6 +924,11 @@ public class HistoryFileManager extends AbstractService {
fileInfo.delete();
}
List<FileStatus> getHistoryDirsForCleaning(long cutoff) throws IOException {
return JobHistoryUtils.
getHistoryDirsForCleaning(doneDirFc, doneDirPrefixPath, cutoff);
}
/**
* Clean up older history files.
*
@ -932,12 +937,9 @@ public class HistoryFileManager extends AbstractService {
*/
@SuppressWarnings("unchecked")
void clean() throws IOException {
// TODO this should be replaced by something that knows about the directory
// structure and will put less of a load on HDFS.
long cutoff = System.currentTimeMillis() - maxHistoryAge;
boolean halted = false;
// TODO Delete YYYY/MM/DD directories.
List<FileStatus> serialDirList = findTimestampedDirectories();
List<FileStatus> serialDirList = getHistoryDirsForCleaning(cutoff);
// Sort in ascending order. Relies on YYYY/MM/DD/Serial
Collections.sort(serialDirList);
for (FileStatus serialDir : serialDirList) {

View File

@ -37,6 +37,7 @@ import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
import org.junit.After;
import org.junit.Test;
import org.mockito.Mockito;
import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.*;
@ -175,7 +176,8 @@ public class TestJobHistory {
doReturn(list2).when(historyManager).scanDirectoryForHistoryFiles(
eq(donePathToday), any(FileContext.class));
doReturn(fileStatusList).when(historyManager).findTimestampedDirectories();
doReturn(fileStatusList).when(historyManager)
.getHistoryDirsForCleaning(Mockito.anyLong());
doReturn(true).when(historyManager).deleteDir(any(FileStatus.class));
JobListCache jobListCache = mock(JobListCache.class);