From ca11ffa5ded7d904b75e0049cb5d9720c453b57c Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 10 Feb 2015 17:27:21 +0000 Subject: [PATCH] YARN-2809. Implement workaround for linux kernel panic when removing cgroup. Contributed by Nathan Roberts (cherry picked from commit 3f5431a22fcef7e3eb9aceeefe324e5b7ac84049) --- hadoop-yarn-project/CHANGES.txt | 3 + .../hadoop/yarn/conf/YarnConfiguration.java | 11 ++- .../util/CgroupsLCEResourcesHandler.java | 73 ++++++++++++++--- .../util/TestCgroupsLCEResourcesHandler.java | 80 ++++++++++++++----- 4 files changed, 136 insertions(+), 31 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 2f777a535d3..22627406db0 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -496,6 +496,9 @@ Release 2.7.0 - UNRELEASED YARN-3090. DeletionService can silently ignore deletion task failures (Varun Saxena via jlowe) + YARN-2809. Implement workaround for linux kernel panic when removing + cgroup (Nathan Roberts via jlowe) + Release 2.6.0 - 2014-11-18 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index d50a7009f1e..544ae1b3e64 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1002,7 +1002,16 @@ public class YarnConfiguration extends Configuration { public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT = 1000; - + + /** + * Delay between attempts to remove linux cgroup. + */ + public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY = + NM_PREFIX + "linux-container-executor.cgroups.delete-delay-ms"; + + public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY = + 20; + /** /* The Windows group that the windows-secure-container-executor should run as. */ diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java index a832a7aafcf..ffa17ac173d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java @@ -22,6 +22,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; @@ -37,6 +38,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.annotations.VisibleForTesting; + import org.apache.commons.io.FileUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -75,6 +77,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler { private final Map controllerPaths; // Controller -> path private long deleteCgroupTimeout; + private long deleteCgroupDelay; // package private for testing purposes Clock clock; @@ -108,6 +111,9 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler { this.deleteCgroupTimeout = conf.getLong( YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT, YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT); + this.deleteCgroupDelay = + conf.getLong(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY, + YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY); // remove extra /'s at end or start of cgroupPrefix if (cgroupPrefix.charAt(0) == '/') { cgroupPrefix = cgroupPrefix.substring(1); @@ -271,23 +277,71 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler { } } + /* + * Utility routine to print first line from cgroup tasks file + */ + private void logLineFromTasksFile(File cgf) { + String str; + if (LOG.isDebugEnabled()) { + try (BufferedReader inl = + new BufferedReader(new InputStreamReader(new FileInputStream(cgf + + "/tasks"), "UTF-8"))) { + if ((str = inl.readLine()) != null) { + LOG.debug("First line in cgroup tasks file: " + cgf + " " + str); + } + } catch (IOException e) { + LOG.warn("Failed to read cgroup tasks file. ", e); + } + } + } + + /** + * If tasks file is empty, delete the cgroup. + * + * @param file object referring to the cgroup to be deleted + * @return Boolean indicating whether cgroup was deleted + */ + @VisibleForTesting + boolean checkAndDeleteCgroup(File cgf) throws InterruptedException { + boolean deleted = false; + // FileInputStream in = null; + try (FileInputStream in = new FileInputStream(cgf + "/tasks")) { + if (in.read() == -1) { + /* + * "tasks" file is empty, sleep a bit more and then try to delete the + * cgroup. Some versions of linux will occasionally panic due to a race + * condition in this area, hence the paranoia. + */ + Thread.sleep(deleteCgroupDelay); + deleted = cgf.delete(); + if (!deleted) { + LOG.warn("Failed attempt to delete cgroup: " + cgf); + } + } else { + logLineFromTasksFile(cgf); + } + } catch (IOException e) { + LOG.warn("Failed to read cgroup tasks file. ", e); + } + return deleted; + } + @VisibleForTesting boolean deleteCgroup(String cgroupPath) { - boolean deleted; - + boolean deleted = false; + if (LOG.isDebugEnabled()) { LOG.debug("deleteCgroup: " + cgroupPath); } - long start = clock.getTime(); do { - deleted = new File(cgroupPath).delete(); - if (!deleted) { - try { - Thread.sleep(20); - } catch (InterruptedException ex) { - // NOP + try { + deleted = checkAndDeleteCgroup(new File(cgroupPath)); + if (!deleted) { + Thread.sleep(deleteCgroupDelay); } + } catch (InterruptedException ex) { + // NOP } } while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout); @@ -295,7 +349,6 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler { LOG.warn("Unable to delete cgroup at: " + cgroupPath + ", tried to delete for " + deleteCgroupTimeout + "ms"); } - return deleted; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java index d0bceee1eb0..4e35169bbdd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java @@ -26,6 +26,8 @@ import org.junit.Assert; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.util.Clock; import org.junit.Test; +import org.junit.After; +import org.junit.Before; import org.mockito.Mockito; import java.io.*; @@ -35,6 +37,7 @@ import java.util.UUID; import java.util.concurrent.CountDownLatch; public class TestCgroupsLCEResourcesHandler { + static File cgroupDir = null; static class MockClock implements Clock { long time; @@ -43,6 +46,51 @@ public class TestCgroupsLCEResourcesHandler { return time; } } + + @Before + public void setUp() throws Exception { + cgroupDir = + new File(System.getProperty("test.build.data", + System.getProperty("java.io.tmpdir", "target")), this.getClass() + .getName()); + FileUtils.deleteQuietly(cgroupDir); + } + + @After + public void tearDown() throws Exception { + FileUtils.deleteQuietly(cgroupDir); + } + + @Test + public void testcheckAndDeleteCgroup() throws Exception { + CgroupsLCEResourcesHandler handler = new CgroupsLCEResourcesHandler(); + handler.setConf(new YarnConfiguration()); + handler.initConfig(); + + FileUtils.deleteQuietly(cgroupDir); + // Test 0 + // tasks file not present, should return false + Assert.assertFalse(handler.checkAndDeleteCgroup(cgroupDir)); + + File tfile = new File(cgroupDir.getAbsolutePath(), "tasks"); + FileOutputStream fos = FileUtils.openOutputStream(tfile); + File fspy = Mockito.spy(cgroupDir); + + // Test 1, tasks file is empty + // tasks file has no data, should return true + Mockito.stub(fspy.delete()).toReturn(true); + Assert.assertTrue(handler.checkAndDeleteCgroup(fspy)); + + // Test 2, tasks file has data + fos.write("1234".getBytes()); + fos.close(); + // tasks has data, would not be able to delete, should return false + Assert.assertFalse(handler.checkAndDeleteCgroup(fspy)); + FileUtils.deleteQuietly(cgroupDir); + + } + + // Verify DeleteCgroup times out if "tasks" file contains data @Test public void testDeleteCgroup() throws Exception { final MockClock clock = new MockClock(); @@ -51,13 +99,15 @@ public class TestCgroupsLCEResourcesHandler { handler.setConf(new YarnConfiguration()); handler.initConfig(); handler.clock = clock; - - //file exists - File file = new File("target", UUID.randomUUID().toString()); - new FileOutputStream(file).close(); - Assert.assertTrue(handler.deleteCgroup(file.getPath())); - //file does not exists, timing out + FileUtils.deleteQuietly(cgroupDir); + + // Create a non-empty tasks file + File tfile = new File(cgroupDir.getAbsolutePath(), "tasks"); + FileOutputStream fos = FileUtils.openOutputStream(tfile); + fos.write("1234".getBytes()); + fos.close(); + final CountDownLatch latch = new CountDownLatch(1); new Thread() { @Override @@ -73,8 +123,8 @@ public class TestCgroupsLCEResourcesHandler { } }.start(); latch.await(); - file = new File("target", UUID.randomUUID().toString()); - Assert.assertFalse(handler.deleteCgroup(file.getPath())); + Assert.assertFalse(handler.deleteCgroup(cgroupDir.getAbsolutePath())); + FileUtils.deleteQuietly(cgroupDir); } static class MockLinuxContainerExecutor extends LinuxContainerExecutor { @@ -122,7 +172,6 @@ public class TestCgroupsLCEResourcesHandler { handler.initConfig(); // create mock cgroup - File cgroupDir = createMockCgroup(); File cgroupMountDir = createMockCgroupMount(cgroupDir); // create mock mtab @@ -202,18 +251,10 @@ public class TestCgroupsLCEResourcesHandler { Assert.assertEquals(-1, ret[1]); } - private File createMockCgroup() throws IOException { - File cgroupDir = new File("target", UUID.randomUUID().toString()); - if (!cgroupDir.mkdir()) { - String message = "Could not create dir " + cgroupDir.getAbsolutePath(); - throw new IOException(message); - } - return cgroupDir; - } - private File createMockCgroupMount(File cgroupDir) throws IOException { File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn"); - if (!cgroupMountDir.mkdir()) { + FileUtils.deleteQuietly(cgroupDir); + if (!cgroupMountDir.mkdirs()) { String message = "Could not create dir " + cgroupMountDir.getAbsolutePath(); throw new IOException(message); @@ -253,7 +294,6 @@ public class TestCgroupsLCEResourcesHandler { handler.initConfig(); // create mock cgroup - File cgroupDir = createMockCgroup(); File cgroupMountDir = createMockCgroupMount(cgroupDir); // create mock mtab