diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java index 2c73af8e629..a4fa8fdd730 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DiskChecker.java @@ -19,14 +19,23 @@ package org.apache.hadoop.util; import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicReference; +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.io.FileUtils; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Class that provides utility functions for checking disk problem @@ -34,6 +43,8 @@ import org.apache.hadoop.fs.permission.FsPermission; @InterfaceAudience.Private @InterfaceStability.Unstable public class DiskChecker { + public static final Logger LOG = LoggerFactory.getLogger(DiskChecker.class); + public static class DiskErrorException extends IOException { public DiskErrorException(String msg) { super(msg); @@ -49,7 +60,12 @@ public class DiskChecker { super(msg); } } - + + // Provider that abstracts some FileOutputStream operations for + // testability. + private static AtomicReference fileIoProvider = + new AtomicReference<>(new DefaultFileIoProvider()); + /** * Create the directory if it doesn't exist and check that dir is readable, * writable and executable @@ -63,6 +79,7 @@ public class DiskChecker { + dir.toString()); } checkAccessByFileMethods(dir); + doDiskIo(dir); } /** @@ -80,6 +97,7 @@ public class DiskChecker { throws DiskErrorException, IOException { mkdirsWithExistsAndPermissionCheck(localFS, dir, expected); checkAccessByFileMethods(localFS.pathToFile(dir)); + doDiskIo(localFS.pathToFile(dir)); } /** @@ -173,4 +191,143 @@ public class DiskChecker { if (created || !localFS.getFileStatus(dir).getPermission().equals(expected)) localFS.setPermission(dir, expected); } + + // State related to running disk IO checks. + private static final String DISK_IO_FILE_PREFIX = + "DiskChecker.OK_TO_DELETE_."; + + @VisibleForTesting + static final int DISK_IO_MAX_ITERATIONS = 3; + + /** + * Performs some disk IO by writing to a new file in the given directory + * and sync'ing file contents to disk. + * + * This increases the likelihood of catching catastrophic disk/controller + * failures sooner. + * + * @param dir directory to be checked. + * @throws DiskErrorException if we hit an error while trying to perform + * disk IO against the file. + */ + private static void doDiskIo(File dir) throws DiskErrorException { + try { + IOException ioe = null; + + for (int i = 0; i < DISK_IO_MAX_ITERATIONS; ++i) { + final File file = getFileNameForDiskIoCheck(dir, i+1); + try { + diskIoCheckWithoutNativeIo(file); + return; + } catch (IOException e) { + // Let's retry a few times before we really give up and + // declare the disk as bad. + ioe = e; + } + } + throw ioe; // Just rethrow the last exception to signal failure. + } catch(IOException e) { + throw new DiskErrorException("Error checking directory " + dir, e); + } + } + + /** + * Try to perform some disk IO by writing to the given file + * without using Native IO. + * + * @param file + * @throws IOException if there was a non-retriable error. + */ + private static void diskIoCheckWithoutNativeIo(File file) + throws IOException { + FileOutputStream fos = null; + + try { + final FileIoProvider provider = fileIoProvider.get(); + fos = provider.get(file); + provider.write(fos, new byte[1]); + fos.getFD().sync(); + fos.close(); + fos = null; + if (!file.delete() && file.exists()) { + throw new IOException("Failed to delete " + file); + } + file = null; + } finally { + IOUtils.cleanup(null, fos); + FileUtils.deleteQuietly(file); + } + } + + /** + * Generate a path name for a test file under the given directory. + * + * @return file object. + */ + @VisibleForTesting + static File getFileNameForDiskIoCheck(File dir, int iterationCount) { + if (iterationCount < DISK_IO_MAX_ITERATIONS) { + // Use file names of the format prefix.001 by default. + return new File(dir, + DISK_IO_FILE_PREFIX + String.format("%03d", iterationCount)); + } else { + // If the first few checks then fail, try using a randomly generated + // file name. + return new File(dir, DISK_IO_FILE_PREFIX + UUID.randomUUID()); + } + } + + /** + * An interface that abstracts operations on {@link FileOutputStream} + * objects for testability. + */ + interface FileIoProvider { + FileOutputStream get(File f) throws FileNotFoundException; + void write(FileOutputStream fos, byte[] data) throws IOException; + } + + /** + * The default implementation of {@link FileIoProvider}. + */ + private static class DefaultFileIoProvider implements FileIoProvider { + /** + * See {@link FileOutputStream#FileOutputStream(File)}. + */ + @Override + public FileOutputStream get(File f) throws FileNotFoundException { + return new FileOutputStream(f); + } + + /** + * See {@link FileOutputStream#write(byte[])}. + */ + @Override + public void write(FileOutputStream fos, byte[] data) throws IOException { + fos.write(data); + } + } + + /** + * Replace the {@link FileIoProvider} for tests. + * This method MUST NOT be used outside of unit tests. + * + * @param newFosProvider + * @return the old FileIoProvider. + */ + @VisibleForTesting + static FileIoProvider replaceFileOutputStreamProvider( + FileIoProvider newFosProvider) { + return fileIoProvider.getAndSet(newFosProvider); + } + + /** + * Retrieve the current {@link FileIoProvider}. + * This method MUST NOT be used outside of unit tests. + * + * @return the current FileIoProvider. + */ + @VisibleForTesting + static FileIoProvider getFileOutputStreamProvider() { + return fileIoProvider.get(); + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java index e2e152aa7ae..40b8d0dd970 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDiskChecker.java @@ -19,7 +19,11 @@ package org.apache.hadoop.util; import java.io.*; import java.nio.file.Files; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.hadoop.util.DiskChecker.FileIoProvider; +import org.junit.After; +import org.junit.Before; import org.junit.Test; import static org.junit.Assert.*; @@ -33,27 +37,46 @@ import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.util.DiskChecker.DiskErrorException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class TestDiskChecker { - final FsPermission defaultPerm = new FsPermission("755"); - final FsPermission invalidPerm = new FsPermission("000"); + public static final Logger LOG = + LoggerFactory.getLogger(TestDiskChecker.class); - @Test (timeout = 30000) + private final FsPermission defaultPerm = new FsPermission("755"); + private final FsPermission invalidPerm = new FsPermission("000"); + + private FileIoProvider fileIoProvider = null; + + @Before + public void setup() { + // Some tests replace the static field DiskChecker#fileIoProvider. + // Cache it so we can restore it after each test completes. + fileIoProvider = DiskChecker.getFileOutputStreamProvider(); + } + + @After + public void cleanup() { + DiskChecker.replaceFileOutputStreamProvider(fileIoProvider); + } + + @Test(timeout = 30000) public void testMkdirs_dirExists() throws Throwable { _mkdirs(true, defaultPerm, defaultPerm); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testMkdirs_noDir() throws Throwable { _mkdirs(false, defaultPerm, defaultPerm); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testMkdirs_dirExists_badUmask() throws Throwable { _mkdirs(true, defaultPerm, invalidPerm); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testMkdirs_noDir_badUmask() throws Throwable { _mkdirs(false, defaultPerm, invalidPerm); } @@ -78,34 +101,33 @@ public class TestDiskChecker { verify(fs).getFileStatus(dir); verify(stat).getPermission(); } - } - catch (DiskErrorException e) { + } catch (DiskErrorException e) { if (before != after) assertTrue(e.getMessage().startsWith("Incorrect permission")); } } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_normal() throws Throwable { _checkDirs(true, new FsPermission("755"), true); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notDir() throws Throwable { _checkDirs(false, new FsPermission("000"), false); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notReadable() throws Throwable { _checkDirs(true, new FsPermission("000"), false); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notWritable() throws Throwable { _checkDirs(true, new FsPermission("444"), false); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notListable() throws Throwable { _checkDirs(true, new FsPermission("666"), false); // not listable } @@ -134,13 +156,15 @@ public class TestDiskChecker { throws Throwable { File localDir = isDir ? createTempDir() : createTempFile(); Shell.execCommand(Shell.getSetPermissionCommand(String.format("%04o", - perm.toShort()), false, localDir.getAbsolutePath())); + perm.toShort()), false, localDir.getAbsolutePath())); try { DiskChecker.checkDir(FileSystem.getLocal(new Configuration()), - new Path(localDir.getAbsolutePath()), perm); - assertTrue("checkDir success", success); + new Path(localDir.getAbsolutePath()), perm); + assertTrue("checkDir success, expected failure", success); } catch (DiskErrorException e) { - assertFalse("checkDir success", success); + if (success) { + throw e; // Unexpected exception! + } } localDir.delete(); } @@ -150,27 +174,27 @@ public class TestDiskChecker { * permission for result of mapper. */ - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_normal_local() throws Throwable { checkDirs(true, "755", true); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notDir_local() throws Throwable { checkDirs(false, "000", false); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notReadable_local() throws Throwable { checkDirs(true, "000", false); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notWritable_local() throws Throwable { checkDirs(true, "444", false); } - @Test (timeout = 30000) + @Test(timeout = 30000) public void testCheckDir_notListable_local() throws Throwable { checkDirs(true, "666", false); } @@ -179,16 +203,116 @@ public class TestDiskChecker { throws Throwable { File localDir = isDir ? createTempDir() : createTempFile(); Shell.execCommand(Shell.getSetPermissionCommand(perm, false, - localDir.getAbsolutePath())); + localDir.getAbsolutePath())); try { DiskChecker.checkDir(localDir); - assertTrue("checkDir success", success); + assertTrue("checkDir success, expected failure", success); } catch (DiskErrorException e) { - e.printStackTrace(); - assertFalse("checkDir success", success); + if (success) { + throw e; // Unexpected exception! + } } localDir.delete(); - System.out.println("checkDir success: " + success); + } + /** + * Verify DiskChecker ignores at least 2 transient file creation errors. + */ + @Test(timeout = 30000) + public void testDiskIoIgnoresTransientCreateErrors() throws Throwable { + DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider( + DiskChecker.DISK_IO_MAX_ITERATIONS - 1, 0)); + checkDirs(true, "755", true); + } + + /** + * Verify DiskChecker bails after 3 file creation errors. + */ + @Test(timeout = 30000) + public void testDiskIoDetectsCreateErrors() throws Throwable { + DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider( + DiskChecker.DISK_IO_MAX_ITERATIONS, 0)); + checkDirs(true, "755", false); + } + + /** + * Verify DiskChecker ignores at least 2 transient file write errors. + */ + @Test(timeout = 30000) + public void testDiskIoIgnoresTransientWriteErrors() throws Throwable { + DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider( + 0, DiskChecker.DISK_IO_MAX_ITERATIONS - 1)); + checkDirs(true, "755", true); + } + + /** + * Verify DiskChecker bails after 3 file write errors. + */ + @Test(timeout = 30000) + public void testDiskIoDetectsWriteErrors() throws Throwable { + DiskChecker.replaceFileOutputStreamProvider(new TestFileIoProvider( + 0, DiskChecker.DISK_IO_MAX_ITERATIONS)); + checkDirs(true, "755", false); + } + + /** + * Verify DiskChecker's test file naming scheme. + */ + @Test(timeout = 30000) + public void testDiskIoFileNaming() throws Throwable { + final File rootDir = new File("/"); + assertTrue(".001".matches("\\.00\\d$")); + for (int i = 1; i < DiskChecker.DISK_IO_MAX_ITERATIONS; ++i) { + final File file = DiskChecker.getFileNameForDiskIoCheck(rootDir, i); + assertTrue( + "File name does not match expected pattern: " + file, + file.toString().matches("^.*\\.[0-9]+$")); + } + final File guidFile = DiskChecker.getFileNameForDiskIoCheck( + rootDir, DiskChecker.DISK_IO_MAX_ITERATIONS); + assertTrue( + "File name does not match expected pattern: " + guidFile, + guidFile.toString().matches("^.*\\.[A-Za-z0-9-]+$")); + } + + /** + * A dummy {@link DiskChecker#FileIoProvider} that can throw a programmable + * number of times. + */ + private static class TestFileIoProvider implements FileIoProvider { + private final AtomicInteger numCreateCalls = new AtomicInteger(0); + private final AtomicInteger numWriteCalls = new AtomicInteger(0); + + private final int numTimesToThrowOnCreate; + private final int numTimesToThrowOnWrite; + + public TestFileIoProvider( + int numTimesToThrowOnCreate, int numTimesToThrowOnWrite) { + this.numTimesToThrowOnCreate = numTimesToThrowOnCreate; + this.numTimesToThrowOnWrite = numTimesToThrowOnWrite; + } + + /** + * {@inheritDoc} + */ + @Override + public FileOutputStream get(File f) throws FileNotFoundException { + if (numCreateCalls.getAndIncrement() < numTimesToThrowOnCreate) { + throw new FileNotFoundException("Dummy exception for testing"); + } + // Can't mock final class FileOutputStream. + return new FileOutputStream(f); + } + + /** + * {@inheritDoc} + */ + @Override + public void write(FileOutputStream fos, byte[] data) throws IOException { + if (numWriteCalls.getAndIncrement() < numTimesToThrowOnWrite) { + throw new IOException("Dummy exception for testing"); + } + fos.write(data); + } } }