From 31a3a99f056c2f058ae41b1464c38309e7393e1e Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Thu, 20 Mar 2014 02:48:48 +0000 Subject: [PATCH] MAPREDUCE-2349. Modified FileInputFormat to be able to issue file and block location calls in parallel. Contributed by Siddharth Seth. svn merge --ignore-ancestry -c 1579515 ../../trunk/ git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1579516 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 3 + .../apache/hadoop/mapred/FileInputFormat.java | 54 ++- .../mapred/LocatedFileStatusFetcher.java | 371 ++++++++++++++++++ .../mapreduce/lib/input/FileInputFormat.java | 51 ++- .../src/main/resources/mapred-default.xml | 9 + .../hadoop/mapred/TestFileInputFormat.java | 126 +++++- .../lib/input/TestFileInputFormat.java | 296 +++++++++++++- 7 files changed, 877 insertions(+), 33 deletions(-) create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LocatedFileStatusFetcher.java diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index f6ee2049630..767dc16c28b 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -57,6 +57,9 @@ Release 2.4.0 - UNRELEASED MAPREDUCE-4052. Improved MapReduce clients to use NodeManagers' ability to handle cross platform application submissions. (Jian He via vinodkv) + MAPREDUCE-2349. Modified FileInputFormat to be able to issue file and block + location calls in parallel. (Siddharth Seth via vinodkv) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java index bb43e20cbee..9863427076e 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java @@ -47,6 +47,9 @@ import org.apache.hadoop.net.NodeBase; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.StringUtils; +import com.google.common.base.Stopwatch; +import com.google.common.collect.Iterables; + /** * A base class for file-based {@link InputFormat}. * @@ -203,10 +206,7 @@ public abstract class FileInputFormat implements InputFormat { // Whether we need to recursive look into the directory structure boolean recursive = job.getBoolean(INPUT_DIR_RECURSIVE, false); - - List result = new ArrayList(); - List errors = new ArrayList(); - + // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List filters = new ArrayList(); @@ -217,6 +217,41 @@ public abstract class FileInputFormat implements InputFormat { } PathFilter inputFilter = new MultiPathFilter(filters); + FileStatus[] result; + int numThreads = job + .getInt( + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.LIST_STATUS_NUM_THREADS, + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS); + + Stopwatch sw = new Stopwatch().start(); + if (numThreads == 1) { + List locatedFiles = singleThreadedListStatus(job, dirs, inputFilter, recursive); + result = locatedFiles.toArray(new FileStatus[locatedFiles.size()]); + } else { + Iterable locatedFiles = null; + try { + + LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher( + job, dirs, recursive, inputFilter, false); + locatedFiles = locatedFileStatusFetcher.getFileStatuses(); + } catch (InterruptedException e) { + throw new IOException("Interrupted while getting file statuses"); + } + result = Iterables.toArray(locatedFiles, FileStatus.class); + } + + sw.stop(); + if (LOG.isDebugEnabled()) { + LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis()); + } + LOG.info("Total input paths to process : " + result.length); + return result; + } + + private List singleThreadedListStatus(JobConf job, Path[] dirs, + PathFilter inputFilter, boolean recursive) throws IOException { + List result = new ArrayList(); + List errors = new ArrayList(); for (Path p: dirs) { FileSystem fs = p.getFileSystem(job); FileStatus[] matches = fs.globStatus(p, inputFilter); @@ -246,12 +281,10 @@ public abstract class FileInputFormat implements InputFormat { } } } - if (!errors.isEmpty()) { throw new InvalidInputException(errors); } - LOG.info("Total input paths to process : " + result.size()); - return result.toArray(new FileStatus[result.size()]); + return result; } /** @@ -267,6 +300,7 @@ public abstract class FileInputFormat implements InputFormat { * they're too big.*/ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { + Stopwatch sw = new Stopwatch().start(); FileStatus[] files = listStatus(job); // Save the number of input files for metrics/loadgen @@ -325,7 +359,11 @@ public abstract class FileInputFormat implements InputFormat { splits.add(makeSplit(path, 0, length, new String[0])); } } - LOG.debug("Total # of splits: " + splits.size()); + sw.stop(); + if (LOG.isDebugEnabled()) { + LOG.debug("Total # of splits generated by getSplits: " + splits.size() + + ", TimeTaken: " + sw.elapsedMillis()); + } return splits.toArray(new FileSplit[splits.size()]); } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LocatedFileStatusFetcher.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LocatedFileStatusFetcher.java new file mode 100644 index 00000000000..87114ad04f4 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LocatedFileStatusFetcher.java @@ -0,0 +1,371 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.mapred; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.classification.InterfaceAudience.Private; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; + +import com.google.common.collect.Iterables; +import com.google.common.util.concurrent.FutureCallback; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; +import com.google.common.util.concurrent.MoreExecutors; +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +/** + * Utility class to fetch block locations for specified Input paths using a + * configured number of threads. + */ +@Private +public class LocatedFileStatusFetcher { + + private final Path[] inputDirs; + private final PathFilter inputFilter; + private final Configuration conf; + private final boolean recursive; + private final boolean newApi; + + private final ExecutorService rawExec; + private final ListeningExecutorService exec; + private final BlockingQueue> resultQueue; + private final List invalidInputErrors = new LinkedList(); + + private final ProcessInitialInputPathCallback processInitialInputPathCallback = + new ProcessInitialInputPathCallback(); + private final ProcessInputDirCallback processInputDirCallback = + new ProcessInputDirCallback(); + + private final AtomicInteger runningTasks = new AtomicInteger(0); + + private final ReentrantLock lock = new ReentrantLock(); + private final Condition condition = lock.newCondition(); + + private volatile Throwable unknownError; + + /** + * @param conf configuration for the job + * @param dirs the initial list of paths + * @param recursive whether to traverse the patchs recursively + * @param inputFilter inputFilter to apply to the resulting paths + * @param newApi whether using the mapred or mapreduce API + * @throws InterruptedException + * @throws IOException + */ + public LocatedFileStatusFetcher(Configuration conf, Path[] dirs, + boolean recursive, PathFilter inputFilter, boolean newApi) throws InterruptedException, + IOException { + int numThreads = conf.getInt(FileInputFormat.LIST_STATUS_NUM_THREADS, + FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS); + rawExec = Executors.newFixedThreadPool( + numThreads, + new ThreadFactoryBuilder().setDaemon(true) + .setNameFormat("GetFileInfo #%d").build()); + exec = MoreExecutors.listeningDecorator(rawExec); + resultQueue = new LinkedBlockingQueue>(); + this.conf = conf; + this.inputDirs = dirs; + this.recursive = recursive; + this.inputFilter = inputFilter; + this.newApi = newApi; + } + + /** + * Start executing and return FileStatuses based on the parameters specified + * @return fetched file statuses + * @throws InterruptedException + * @throws IOException + */ + public Iterable getFileStatuses() throws InterruptedException, + IOException { + // Increment to make sure a race between the first thread completing and the + // rest being scheduled does not lead to a termination. + runningTasks.incrementAndGet(); + for (Path p : inputDirs) { + runningTasks.incrementAndGet(); + ListenableFuture future = exec + .submit(new ProcessInitialInputPathCallable(p, conf, inputFilter)); + Futures.addCallback(future, processInitialInputPathCallback); + } + + runningTasks.decrementAndGet(); + + lock.lock(); + try { + while (runningTasks.get() != 0 && unknownError == null) { + condition.await(); + } + } finally { + lock.unlock(); + } + this.exec.shutdownNow(); + if (this.unknownError != null) { + if (this.unknownError instanceof Error) { + throw (Error) this.unknownError; + } else if (this.unknownError instanceof RuntimeException) { + throw (RuntimeException) this.unknownError; + } else if (this.unknownError instanceof IOException) { + throw (IOException) this.unknownError; + } else if (this.unknownError instanceof InterruptedException) { + throw (InterruptedException) this.unknownError; + } else { + throw new IOException(this.unknownError); + } + } + if (this.invalidInputErrors.size() != 0) { + if (this.newApi) { + throw new org.apache.hadoop.mapreduce.lib.input.InvalidInputException( + invalidInputErrors); + } else { + throw new InvalidInputException(invalidInputErrors); + } + } + return Iterables.concat(resultQueue); + } + + /** + * Collect misconfigured Input errors. Errors while actually reading file info + * are reported immediately + */ + private void registerInvalidInputError(List errors) { + synchronized (this) { + this.invalidInputErrors.addAll(errors); + } + } + + /** + * Register fatal errors - example an IOException while accessing a file or a + * full exection queue + */ + private void registerError(Throwable t) { + lock.lock(); + try { + if (unknownError != null) { + unknownError = t; + condition.signal(); + } + + } finally { + lock.unlock(); + } + } + + private void decrementRunningAndCheckCompletion() { + lock.lock(); + try { + if (runningTasks.decrementAndGet() == 0) { + condition.signal(); + } + } finally { + lock.unlock(); + } + } + + /** + * Retrieves block locations for the given @link {@link FileStatus}, and adds + * additional paths to the process queue if required. + */ + private static class ProcessInputDirCallable implements + Callable { + + private final FileSystem fs; + private final FileStatus fileStatus; + private final boolean recursive; + private final PathFilter inputFilter; + + ProcessInputDirCallable(FileSystem fs, FileStatus fileStatus, + boolean recursive, PathFilter inputFilter) { + this.fs = fs; + this.fileStatus = fileStatus; + this.recursive = recursive; + this.inputFilter = inputFilter; + } + + @Override + public Result call() throws Exception { + Result result = new Result(); + result.fs = fs; + + if (fileStatus.isDirectory()) { + RemoteIterator iter = fs + .listLocatedStatus(fileStatus.getPath()); + while (iter.hasNext()) { + LocatedFileStatus stat = iter.next(); + if (inputFilter.accept(stat.getPath())) { + if (recursive && stat.isDirectory()) { + result.dirsNeedingRecursiveCalls.add(stat); + } else { + result.locatedFileStatuses.add(stat); + } + } + } + } else { + result.locatedFileStatuses.add(fileStatus); + } + return result; + } + + private static class Result { + private List locatedFileStatuses = new LinkedList(); + private List dirsNeedingRecursiveCalls = new LinkedList(); + private FileSystem fs; + } + } + + /** + * The callback handler to handle results generated by + * {@link ProcessInputDirCallable}. This populates the final result set. + * + */ + private class ProcessInputDirCallback implements + FutureCallback { + + @Override + public void onSuccess(ProcessInputDirCallable.Result result) { + try { + if (result.locatedFileStatuses.size() != 0) { + resultQueue.add(result.locatedFileStatuses); + } + if (result.dirsNeedingRecursiveCalls.size() != 0) { + for (FileStatus fileStatus : result.dirsNeedingRecursiveCalls) { + runningTasks.incrementAndGet(); + ListenableFuture future = exec + .submit(new ProcessInputDirCallable(result.fs, fileStatus, + recursive, inputFilter)); + Futures.addCallback(future, processInputDirCallback); + } + } + decrementRunningAndCheckCompletion(); + } catch (Throwable t) { // Error within the callback itself. + registerError(t); + } + } + + @Override + public void onFailure(Throwable t) { + // Any generated exceptions. Leads to immediate termination. + registerError(t); + } + } + + + /** + * Processes an initial Input Path pattern through the globber and PathFilter + * to generate a list of files which need further processing. + */ + private static class ProcessInitialInputPathCallable implements + Callable { + + private final Path path; + private final Configuration conf; + private final PathFilter inputFilter; + + public ProcessInitialInputPathCallable(Path path, Configuration conf, + PathFilter pathFilter) { + this.path = path; + this.conf = conf; + this.inputFilter = pathFilter; + } + + @Override + public Result call() throws Exception { + Result result = new Result(); + FileSystem fs = path.getFileSystem(conf); + result.fs = fs; + FileStatus[] matches = fs.globStatus(path, inputFilter); + if (matches == null) { + result.addError(new IOException("Input path does not exist: " + path)); + } else if (matches.length == 0) { + result.addError(new IOException("Input Pattern " + path + + " matches 0 files")); + } else { + result.matchedFileStatuses = matches; + } + return result; + } + + private static class Result { + private List errors; + private FileStatus[] matchedFileStatuses; + private FileSystem fs; + + void addError(IOException ioe) { + if (errors == null) { + errors = new LinkedList(); + } + errors.add(ioe); + } + } + } + + /** + * The callback handler to handle results generated by + * {@link ProcessInitialInputPathCallable} + * + */ + private class ProcessInitialInputPathCallback implements + FutureCallback { + + @Override + public void onSuccess(ProcessInitialInputPathCallable.Result result) { + try { + if (result.errors != null) { + registerInvalidInputError(result.errors); + } + if (result.matchedFileStatuses != null) { + for (FileStatus matched : result.matchedFileStatuses) { + runningTasks.incrementAndGet(); + ListenableFuture future = exec + .submit(new ProcessInputDirCallable(result.fs, matched, + recursive, inputFilter)); + Futures.addCallback(future, processInputDirCallback); + } + } + decrementRunningAndCheckCompletion(); + } catch (Throwable t) { // Exception within the callback + registerError(t); + } + } + + @Override + public void onFailure(Throwable t) { + // Any generated exceptions. Leads to immediate termination. + registerError(t); + } + } +} \ No newline at end of file diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java index e46703d7f9c..5f32f11ca0c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java @@ -34,6 +34,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.mapred.LocatedFileStatusFetcher; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; @@ -43,6 +44,9 @@ import org.apache.hadoop.mapreduce.security.TokenCache; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.StringUtils; +import com.google.common.base.Stopwatch; +import com.google.common.collect.Lists; + /** * A base class for file-based {@link InputFormat}s. * @@ -68,6 +72,9 @@ public abstract class FileInputFormat extends InputFormat { "mapreduce.input.fileinputformat.numinputfiles"; public static final String INPUT_DIR_RECURSIVE = "mapreduce.input.fileinputformat.input.dir.recursive"; + public static final String LIST_STATUS_NUM_THREADS = + "mapreduce.input.fileinputformat.list-status.num-threads"; + public static final int DEFAULT_LIST_STATUS_NUM_THREADS = 1; private static final Log LOG = LogFactory.getLog(FileInputFormat.class); @@ -225,7 +232,6 @@ public abstract class FileInputFormat extends InputFormat { */ protected List listStatus(JobContext job ) throws IOException { - List result = new ArrayList(); Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); @@ -237,9 +243,7 @@ public abstract class FileInputFormat extends InputFormat { // Whether we need to recursive look into the directory structure boolean recursive = getInputDirRecursive(job); - - List errors = new ArrayList(); - + // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). List filters = new ArrayList(); @@ -250,6 +254,37 @@ public abstract class FileInputFormat extends InputFormat { } PathFilter inputFilter = new MultiPathFilter(filters); + List result = null; + + int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, + DEFAULT_LIST_STATUS_NUM_THREADS); + Stopwatch sw = new Stopwatch().start(); + if (numThreads == 1) { + result = singleThreadedListStatus(job, dirs, inputFilter, recursive); + } else { + Iterable locatedFiles = null; + try { + LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher( + job.getConfiguration(), dirs, recursive, inputFilter, true); + locatedFiles = locatedFileStatusFetcher.getFileStatuses(); + } catch (InterruptedException e) { + throw new IOException("Interrupted while getting file statuses"); + } + result = Lists.newArrayList(locatedFiles); + } + + sw.stop(); + if (LOG.isDebugEnabled()) { + LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis()); + } + LOG.info("Total input paths to process : " + result.size()); + return result; + } + + private List singleThreadedListStatus(JobContext job, Path[] dirs, + PathFilter inputFilter, boolean recursive) throws IOException { + List result = new ArrayList(); + List errors = new ArrayList(); for (int i=0; i < dirs.length; ++i) { Path p = dirs[i]; FileSystem fs = p.getFileSystem(job.getConfiguration()); @@ -284,7 +319,6 @@ public abstract class FileInputFormat extends InputFormat { if (!errors.isEmpty()) { throw new InvalidInputException(errors); } - LOG.info("Total input paths to process : " + result.size()); return result; } @@ -332,6 +366,7 @@ public abstract class FileInputFormat extends InputFormat { * @throws IOException */ public List getSplits(JobContext job) throws IOException { + Stopwatch sw = new Stopwatch().start(); long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); @@ -376,7 +411,11 @@ public abstract class FileInputFormat extends InputFormat { } // Save the number of input files for metrics/loadgen job.getConfiguration().setLong(NUM_INPUT_FILES, files.size()); - LOG.debug("Total # of splits: " + splits.size()); + sw.stop(); + if (LOG.isDebugEnabled()) { + LOG.debug("Total # of splits generated by getSplits: " + splits.size() + + ", TimeTaken: " + sw.elapsedMillis()); + } return splits; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml index 465a446e4bb..2c260e1a669 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml @@ -694,6 +694,15 @@ take priority over this setting. + + mapreduce.input.fileinputformat.list-status.num-threads + 1 + The number of threads to use to list and fetch block locations + for the specified input paths. Note: multiple threads should not be used + if a custom non thread-safe path filter is used. + + + mapreduce.jobtracker.maxtasks.perjob -1 diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java index 876b718acf7..0bb4e96470f 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java @@ -19,7 +19,12 @@ package org.apache.hadoop.mapred; import java.io.FileNotFoundException; import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FileStatus; @@ -29,15 +34,58 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.RawLocalFileSystem; import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.junit.After; import org.junit.Assert; +import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; +import com.google.common.collect.Lists; + +@RunWith(value = Parameterized.class) public class TestFileInputFormat { - + + private static final Log LOG = LogFactory.getLog(TestFileInputFormat.class); + + private static String testTmpDir = System.getProperty("test.build.data", "/tmp"); + private static final Path TEST_ROOT_DIR = new Path(testTmpDir, "TestFIF"); + + private static FileSystem localFs; + + private int numThreads; + + public TestFileInputFormat(int numThreads) { + this.numThreads = numThreads; + LOG.info("Running with numThreads: " + numThreads); + } + + @Parameters + public static Collection data() { + Object[][] data = new Object[][] { { 1 }, { 5 }}; + return Arrays.asList(data); + } + + @Before + public void setup() throws IOException { + LOG.info("Using Test Dir: " + TEST_ROOT_DIR); + localFs = FileSystem.getLocal(new Configuration()); + localFs.delete(TEST_ROOT_DIR, true); + localFs.mkdirs(TEST_ROOT_DIR); + } + + @After + public void cleanup() throws IOException { + localFs.delete(TEST_ROOT_DIR, true); + } + @Test public void testListLocatedStatus() throws Exception { Configuration conf = getConfiguration(); conf.setBoolean("fs.test.impl.disable.cache", false); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "test:///a1/a2"); MockFileSystem mockFs = @@ -51,6 +99,82 @@ public class TestFileInputFormat { Assert.assertEquals("Input splits are not correct", 2, splits.length); Assert.assertEquals("listLocatedStatuss calls", 1, mockFs.numListLocatedStatusCalls); + FileSystem.closeAll(); + } + + @Test + public void testListStatusSimple() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + List expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat + .configureTestSimple(conf, localFs); + + JobConf jobConf = new JobConf(conf); + TextInputFormat fif = new TextInputFormat(); + fif.configure(jobConf); + FileStatus[] statuses = fif.listStatus(jobConf); + + org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat + .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses), + localFs); + } + + @Test + public void testListStatusNestedRecursive() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + List expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat + .configureTestNestedRecursive(conf, localFs); + JobConf jobConf = new JobConf(conf); + TextInputFormat fif = new TextInputFormat(); + fif.configure(jobConf); + FileStatus[] statuses = fif.listStatus(jobConf); + + org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat + .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses), + localFs); + } + + @Test + public void testListStatusNestedNonRecursive() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + List expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat + .configureTestNestedNonRecursive(conf, localFs); + JobConf jobConf = new JobConf(conf); + TextInputFormat fif = new TextInputFormat(); + fif.configure(jobConf); + FileStatus[] statuses = fif.listStatus(jobConf); + + org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat + .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses), + localFs); + } + + @Test + public void testListStatusErrorOnNonExistantDir() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat + .configureTestErrorOnNonExistantDir(conf, localFs); + JobConf jobConf = new JobConf(conf); + TextInputFormat fif = new TextInputFormat(); + fif.configure(jobConf); + try { + fif.listStatus(jobConf); + Assert.fail("Expecting an IOException for a missing Input path"); + } catch (IOException e) { + Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2"); + expectedExceptionPath = localFs.makeQualified(expectedExceptionPath); + Assert.assertTrue(e instanceof InvalidInputException); + Assert.assertEquals( + "Input path does not exist: " + expectedExceptionPath.toString(), + e.getMessage()); + } } private Configuration getConfiguration() { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java index 77445596d93..1999f8b962c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java @@ -19,10 +19,17 @@ package org.apache.hadoop.mapreduce.lib.input; import java.io.FileNotFoundException; import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; import java.util.List; +import java.util.Set; + +import javax.annotation.Nullable; import junit.framework.Assert; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FileStatus; @@ -34,55 +41,90 @@ import org.apache.hadoop.fs.RawLocalFileSystem; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; +import org.junit.After; +import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; +import com.google.common.base.Function; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +@RunWith(value = Parameterized.class) public class TestFileInputFormat { + + private static final Log LOG = LogFactory.getLog(TestFileInputFormat.class); + + private static String testTmpDir = System.getProperty("test.build.data", "/tmp"); + private static final Path TEST_ROOT_DIR = new Path(testTmpDir, "TestFIF"); + + private static FileSystem localFs; + + private int numThreads; + + public TestFileInputFormat(int numThreads) { + this.numThreads = numThreads; + LOG.info("Running with numThreads: " + numThreads); + } + + @Parameters + public static Collection data() { + Object[][] data = new Object[][] { { 1 }, { 5 }}; + return Arrays.asList(data); + } + + @Before + public void setup() throws IOException { + LOG.info("Using Test Dir: " + TEST_ROOT_DIR); + localFs = FileSystem.getLocal(new Configuration()); + localFs.delete(TEST_ROOT_DIR, true); + localFs.mkdirs(TEST_ROOT_DIR); + } + + @After + public void cleanup() throws IOException { + localFs.delete(TEST_ROOT_DIR, true); + } @Test public void testNumInputFilesRecursively() throws Exception { Configuration conf = getConfiguration(); conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true"); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); Job job = Job.getInstance(conf); FileInputFormat fileInputFormat = new TextInputFormat(); List splits = fileInputFormat.getSplits(job); Assert.assertEquals("Input splits are not correct", 3, splits.size()); - Assert.assertEquals("test:/a1/a2/file2", ((FileSplit) splits.get(0)) - .getPath().toString()); - Assert.assertEquals("test:/a1/a2/file3", ((FileSplit) splits.get(1)) - .getPath().toString()); - Assert.assertEquals("test:/a1/file1", ((FileSplit) splits.get(2)).getPath() - .toString()); - + verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", + "test:/a1/file1"), splits); + // Using the deprecated configuration conf = getConfiguration(); conf.set("mapred.input.dir.recursive", "true"); job = Job.getInstance(conf); splits = fileInputFormat.getSplits(job); - Assert.assertEquals("Input splits are not correct", 3, splits.size()); - Assert.assertEquals("test:/a1/a2/file2", ((FileSplit) splits.get(0)) - .getPath().toString()); - Assert.assertEquals("test:/a1/a2/file3", ((FileSplit) splits.get(1)) - .getPath().toString()); - Assert.assertEquals("test:/a1/file1", ((FileSplit) splits.get(2)).getPath() - .toString()); + verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", + "test:/a1/file1"), splits); } @Test public void testNumInputFilesWithoutRecursively() throws Exception { Configuration conf = getConfiguration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); Job job = Job.getInstance(conf); FileInputFormat fileInputFormat = new TextInputFormat(); List splits = fileInputFormat.getSplits(job); Assert.assertEquals("Input splits are not correct", 2, splits.size()); - Assert.assertEquals("test:/a1/a2", ((FileSplit) splits.get(0)).getPath() - .toString()); - Assert.assertEquals("test:/a1/file1", ((FileSplit) splits.get(1)).getPath() - .toString()); + verifySplits(Lists.newArrayList("test:/a1/a2", "test:/a1/file1"), splits); } @Test public void testListLocatedStatus() throws Exception { Configuration conf = getConfiguration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); conf.setBoolean("fs.test.impl.disable.cache", false); conf.set(FileInputFormat.INPUT_DIR, "test:///a1/a2"); MockFileSystem mockFs = @@ -95,8 +137,226 @@ public class TestFileInputFormat { Assert.assertEquals("Input splits are not correct", 2, splits.size()); Assert.assertEquals("listLocatedStatuss calls", 1, mockFs.numListLocatedStatusCalls); + FileSystem.closeAll(); } + @Test + public void testListStatusSimple() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + List expectedPaths = configureTestSimple(conf, localFs); + + Job job = Job.getInstance(conf); + FileInputFormat fif = new TextInputFormat(); + List statuses = fif.listStatus(job); + + verifyFileStatuses(expectedPaths, statuses, localFs); + } + + @Test + public void testListStatusNestedRecursive() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + List expectedPaths = configureTestNestedRecursive(conf, localFs); + Job job = Job.getInstance(conf); + FileInputFormat fif = new TextInputFormat(); + List statuses = fif.listStatus(job); + + verifyFileStatuses(expectedPaths, statuses, localFs); + } + + + @Test + public void testListStatusNestedNonRecursive() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + List expectedPaths = configureTestNestedNonRecursive(conf, localFs); + Job job = Job.getInstance(conf); + FileInputFormat fif = new TextInputFormat(); + List statuses = fif.listStatus(job); + + verifyFileStatuses(expectedPaths, statuses, localFs); + } + + @Test + public void testListStatusErrorOnNonExistantDir() throws IOException { + Configuration conf = new Configuration(); + conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads); + + configureTestErrorOnNonExistantDir(conf, localFs); + Job job = Job.getInstance(conf); + FileInputFormat fif = new TextInputFormat(); + try { + fif.listStatus(job); + Assert.fail("Expecting an IOException for a missing Input path"); + } catch (IOException e) { + Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2"); + expectedExceptionPath = localFs.makeQualified(expectedExceptionPath); + Assert.assertTrue(e instanceof InvalidInputException); + Assert.assertEquals( + "Input path does not exist: " + expectedExceptionPath.toString(), + e.getMessage()); + } + } + + public static List configureTestSimple(Configuration conf, FileSystem localFs) + throws IOException { + Path base1 = new Path(TEST_ROOT_DIR, "input1"); + Path base2 = new Path(TEST_ROOT_DIR, "input2"); + conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, + localFs.makeQualified(base1) + "," + localFs.makeQualified(base2)); + localFs.mkdirs(base1); + localFs.mkdirs(base2); + + Path in1File1 = new Path(base1, "file1"); + Path in1File2 = new Path(base1, "file2"); + localFs.createNewFile(in1File1); + localFs.createNewFile(in1File2); + + Path in2File1 = new Path(base2, "file1"); + Path in2File2 = new Path(base2, "file2"); + localFs.createNewFile(in2File1); + localFs.createNewFile(in2File2); + List expectedPaths = Lists.newArrayList(in1File1, in1File2, in2File1, + in2File2); + return expectedPaths; + } + + public static List configureTestNestedRecursive(Configuration conf, + FileSystem localFs) throws IOException { + Path base1 = new Path(TEST_ROOT_DIR, "input1"); + conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, + localFs.makeQualified(base1).toString()); + conf.setBoolean( + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE, + true); + localFs.mkdirs(base1); + + Path inDir1 = new Path(base1, "dir1"); + Path inDir2 = new Path(base1, "dir2"); + Path inFile1 = new Path(base1, "file1"); + + Path dir1File1 = new Path(inDir1, "file1"); + Path dir1File2 = new Path(inDir1, "file2"); + + Path dir2File1 = new Path(inDir2, "file1"); + Path dir2File2 = new Path(inDir2, "file2"); + + localFs.mkdirs(inDir1); + localFs.mkdirs(inDir2); + + localFs.createNewFile(inFile1); + localFs.createNewFile(dir1File1); + localFs.createNewFile(dir1File2); + localFs.createNewFile(dir2File1); + localFs.createNewFile(dir2File2); + + List expectedPaths = Lists.newArrayList(inFile1, dir1File1, + dir1File2, dir2File1, dir2File2); + return expectedPaths; + } + + public static List configureTestNestedNonRecursive(Configuration conf, + FileSystem localFs) throws IOException { + Path base1 = new Path(TEST_ROOT_DIR, "input1"); + conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, + localFs.makeQualified(base1).toString()); + conf.setBoolean( + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE, + false); + localFs.mkdirs(base1); + + Path inDir1 = new Path(base1, "dir1"); + Path inDir2 = new Path(base1, "dir2"); + Path inFile1 = new Path(base1, "file1"); + + Path dir1File1 = new Path(inDir1, "file1"); + Path dir1File2 = new Path(inDir1, "file2"); + + Path dir2File1 = new Path(inDir2, "file1"); + Path dir2File2 = new Path(inDir2, "file2"); + + localFs.mkdirs(inDir1); + localFs.mkdirs(inDir2); + + localFs.createNewFile(inFile1); + localFs.createNewFile(dir1File1); + localFs.createNewFile(dir1File2); + localFs.createNewFile(dir2File1); + localFs.createNewFile(dir2File2); + + List expectedPaths = Lists.newArrayList(inFile1, inDir1, inDir2); + return expectedPaths; + } + + public static List configureTestErrorOnNonExistantDir(Configuration conf, + FileSystem localFs) throws IOException { + Path base1 = new Path(TEST_ROOT_DIR, "input1"); + Path base2 = new Path(TEST_ROOT_DIR, "input2"); + conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, + localFs.makeQualified(base1) + "," + localFs.makeQualified(base2)); + conf.setBoolean( + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE, + true); + localFs.mkdirs(base1); + + Path inFile1 = new Path(base1, "file1"); + Path inFile2 = new Path(base1, "file2"); + + localFs.createNewFile(inFile1); + localFs.createNewFile(inFile2); + + List expectedPaths = Lists.newArrayList(); + return expectedPaths; + } + + public static void verifyFileStatuses(List expectedPaths, + List fetchedStatuses, final FileSystem localFs) { + Assert.assertEquals(expectedPaths.size(), fetchedStatuses.size()); + + Iterable fqExpectedPaths = Iterables.transform(expectedPaths, + new Function() { + @Override + public Path apply(Path input) { + return localFs.makeQualified(input); + } + }); + + Set expectedPathSet = Sets.newHashSet(fqExpectedPaths); + for (FileStatus fileStatus : fetchedStatuses) { + if (!expectedPathSet.remove(localFs.makeQualified(fileStatus.getPath()))) { + Assert.fail("Found extra fetched status: " + fileStatus.getPath()); + } + } + Assert.assertEquals( + "Not all expectedPaths matched: " + expectedPathSet.toString(), 0, + expectedPathSet.size()); + } + + + private void verifySplits(List expected, List splits) { + Iterable pathsFromSplits = Iterables.transform(splits, + new Function() { + @Override + public String apply(@Nullable InputSplit input) { + return ((FileSplit) input).getPath().toString(); + } + }); + + Set expectedSet = Sets.newHashSet(expected); + for (String splitPathString : pathsFromSplits) { + if (!expectedSet.remove(splitPathString)) { + Assert.fail("Found extra split: " + splitPathString); + } + } + Assert.assertEquals( + "Not all expectedPaths matched: " + expectedSet.toString(), 0, + expectedSet.size()); + } + private Configuration getConfiguration() { Configuration conf = new Configuration(); conf.set("fs.test.impl.disable.cache", "true");