Merge pull request #2429 from metamx/hadoopClasspath

Make hadoop classpath isolation more explicit
This commit is contained in:
Xavier Léauté 2016-02-10 13:16:51 -08:00
commit 2b69e4b00f
2 changed files with 98 additions and 7 deletions

View File

@ -20,7 +20,9 @@
package io.druid.indexing.common.task; package io.druid.indexing.common.task;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.base.Throwables; import com.google.common.base.Throwables;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.inject.Injector; import com.google.inject.Injector;
@ -30,11 +32,14 @@ import io.druid.guice.GuiceInjectors;
import io.druid.indexing.common.TaskToolbox; import io.druid.indexing.common.TaskToolbox;
import io.druid.initialization.Initialization; import io.druid.initialization.Initialization;
import javax.annotation.Nullable;
import java.io.File; import java.io.File;
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method; import java.lang.reflect.Method;
import java.net.URISyntaxException;
import java.net.URL; import java.net.URL;
import java.net.URLClassLoader; import java.net.URLClassLoader;
import java.nio.file.Paths;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -69,6 +74,27 @@ public abstract class HadoopTask extends AbstractTask
return hadoopDependencyCoordinates == null ? null : ImmutableList.copyOf(hadoopDependencyCoordinates); return hadoopDependencyCoordinates == null ? null : ImmutableList.copyOf(hadoopDependencyCoordinates);
} }
// Predicate that identifies Druid's own jars among classloader URLs, so they can be
// separated out when building the isolated Hadoop classloader hierarchy.
// This could stand to have a more robust detection methodology.
// Right now it just looks for `druid.*\.jar`
// This is only used for classpath isolation in the runTask isolation stuff, so it shooouuullldddd be ok.
protected static final Predicate<URL> IS_DRUID_URL = new Predicate<URL>()
{
@Override
public boolean apply(@Nullable URL input)
{
try {
// Null URLs are simply not Druid jars; filter them out rather than NPE.
if (input == null) {
return false;
}
// Convert URL -> URI -> Path to get the bare file name (last path segment, percent-decoding applied).
// NOTE(review): Paths.get(uri) can throw an unchecked FileSystemNotFoundException or
// IllegalArgumentException for non-file URLs (e.g. jar: scheme); classloader URLs are
// presumably always file: here — confirm.
final String fName = Paths.get(input.toURI()).getFileName().toString();
return fName.startsWith("druid") && fName.endsWith(".jar");
}
catch (URISyntaxException e) {
// A classloader URL that is not a valid URI indicates a broken environment; fail fast.
// NOTE(review): Throwables.propagate is deprecated in newer Guava — consider
// `throw new RuntimeException(e)` if Guava is ever upgraded.
throw Throwables.propagate(e);
}
}
};
protected ClassLoader buildClassLoader(final TaskToolbox toolbox) throws Exception protected ClassLoader buildClassLoader(final TaskToolbox toolbox) throws Exception
{ {
final List<String> finalHadoopDependencyCoordinates = hadoopDependencyCoordinates != null final List<String> finalHadoopDependencyCoordinates = hadoopDependencyCoordinates != null
@ -84,27 +110,35 @@ public abstract class HadoopTask extends AbstractTask
final List<URL> nonHadoopURLs = Lists.newArrayList(); final List<URL> nonHadoopURLs = Lists.newArrayList();
nonHadoopURLs.addAll(Arrays.asList(((URLClassLoader) HadoopIndexTask.class.getClassLoader()).getURLs())); nonHadoopURLs.addAll(Arrays.asList(((URLClassLoader) HadoopIndexTask.class.getClassLoader()).getURLs()));
final List<URL> driverURLs = Lists.newArrayList(); final List<URL> druidURLs = ImmutableList.copyOf(
driverURLs.addAll(nonHadoopURLs); Collections2.filter(nonHadoopURLs, IS_DRUID_URL)
);
final List<URL> nonHadoopNotDruidURLs = Lists.newArrayList(nonHadoopURLs);
nonHadoopNotDruidURLs.removeAll(druidURLs);
// put hadoop dependencies last to avoid jets3t & apache.httpcore version conflicts // Start from a fresh slate
ClassLoader classLoader = null;
classLoader = new URLClassLoader(nonHadoopNotDruidURLs.toArray(new URL[nonHadoopNotDruidURLs.size()]), classLoader);
// hadoop dependencies come before druid classes because some extensions depend on them
for (final File hadoopDependency : for (final File hadoopDependency :
Initialization.getHadoopDependencyFilesToLoad( Initialization.getHadoopDependencyFilesToLoad(
finalHadoopDependencyCoordinates, finalHadoopDependencyCoordinates,
extensionsConfig extensionsConfig
)) { )) {
final ClassLoader hadoopLoader = Initialization.getClassLoaderForExtension(hadoopDependency); final ClassLoader hadoopLoader = Initialization.getClassLoaderForExtension(hadoopDependency);
driverURLs.addAll(Arrays.asList(((URLClassLoader) hadoopLoader).getURLs())); classLoader = new URLClassLoader(((URLClassLoader) hadoopLoader).getURLs(), classLoader);
} }
final URLClassLoader loader = new URLClassLoader(driverURLs.toArray(new URL[driverURLs.size()]), null); classLoader = new URLClassLoader(druidURLs.toArray(new URL[druidURLs.size()]), classLoader);
classLoader = new URLClassLoader(extensionURLs.toArray(new URL[extensionURLs.size()]), classLoader);
final List<URL> jobUrls = Lists.newArrayList(); final List<URL> jobUrls = Lists.newArrayList();
jobUrls.addAll(nonHadoopURLs); jobUrls.addAll(nonHadoopURLs);
jobUrls.addAll(extensionURLs); jobUrls.addAll(extensionURLs);
System.setProperty("druid.hadoop.internal.classpath", Joiner.on(File.pathSeparator).join(jobUrls)); System.setProperty("druid.hadoop.internal.classpath", Joiner.on(File.pathSeparator).join(jobUrls));
return loader; return classLoader;
} }
/** /**

View File

@ -65,7 +65,64 @@ public class PullDependencies implements Runnable
private static final Set<String> exclusions = Sets.newHashSet( private static final Set<String> exclusions = Sets.newHashSet(
"io.druid", "io.druid",
"com.metamx.druid" "com.metamx.druid",
// It is possible that extensions will pull down a lot of jars that are either
// duplicates OR conflict with druid jars. In that case, there are two problems that arise
//
// 1. Large quantity of jars are passed around to things like hadoop when they are not needed (and should not be included)
// 2. Classpath priority becomes "mostly correct" and is attempted to be enforced correctly, but is not fully tested
//
// These jar groups should be included by druid and *not* pulled down in extensions
// Note to future developers: This list is hand-crafted and will probably be out of date in the future
// A good way to know where to look for errant dependencies is to compare the lib/ directory in the distribution
// tarball with the jars included in the extension directories.
//
// This list is best-effort, and might still pull down more than desired.
//
// A simple example is that if an extension's dependency uses some-library-123.jar,
// druid uses some-library-456.jar, and hadoop uses some-library-666.jar, then we probably want to use some-library-456.jar,
// so don't pull down some-library-123.jar, and ask hadoop to load some-library-456.jar.
//
// In the case where some-library is NOT on this list, both some-library-456.jar and some-library-123.jar will be
// on the class path and propagated around the system. Most places TRY to make sure some-library-456.jar has
// precedence, but it is easy for this assumption to be violated and for the precedence of some-library-456.jar,
// some-library-123.jar and some-library-666.jar to not be properly defined.
//
// As of this writing there are no special unit tests for classloader issues and library version conflicts.
//
// Different tasks which are classloader sensitive attempt to maintain a sane order for loading libraries in the
// classloader, but it is always possible that something didn't load in the right order. Also we don't want to be
// throwing around a ton of jars we don't need to.
"asm",
"org.ow2.asm",
"org.jboss.netty",
"com.google.guava",
"com.google.code.findbugs",
"com.google.protobuf",
"com.esotericsoftware.minlog",
"log4j",
"org.slf4j",
"commons-logging",
"org.eclipse.jetty",
"org.mortbay.jetty",
"com.sun.jersey",
"com.sun.jersey.contribs",
"common-beanutils",
"commons-codec",
"commons-lang",
"commons-cli",
"commons-io",
"javax.activation",
"org.apache.httpcomponents",
"org.apache.zookeeper",
"org.codehaus.jackson",
"com.fasterxml.jackson",
"com.fasterxml.jackson.core",
"com.fasterxml.jackson.dataformat",
"com.fasterxml.jackson.datatype",
"io.netty",
"org.roaringbitmap"
); );
private TeslaAether aether; private TeslaAether aether;