mirror of https://github.com/apache/druid.git
1) Add support for storing segments in HDFS
parent cd535fcd79
commit 60b279b0d3
@@ -46,32 +46,49 @@ public class CompressionUtils
 
   public static long zip(File directory, File outputZipFile) throws IOException
   {
-    if (!directory.isDirectory()) {
-      throw new IOException(String.format("directory[%s] is not a directory", directory));
-    }
-
     if (!outputZipFile.getName().endsWith(".zip")) {
       log.warn("No .zip suffix[%s], putting files from [%s] into it anyway.", outputZipFile, directory);
     }
 
+    final FileOutputStream out = new FileOutputStream(outputZipFile);
+    try {
+      final long retVal = zip(directory, out);
+
+      out.close();
+
+      return retVal;
+    }
+    finally {
+      Closeables.closeQuietly(out);
+    }
+  }
+
+  public static long zip(File directory, OutputStream out) throws IOException
+  {
+    if (!directory.isDirectory()) {
+      throw new IOException(String.format("directory[%s] is not a directory", directory));
+    }
+
     long totalSize = 0;
     ZipOutputStream zipOut = null;
     try {
-      zipOut = new ZipOutputStream(new FileOutputStream(outputZipFile));
+      zipOut = new ZipOutputStream(out);
       File[] files = directory.listFiles();
       for (File file : files) {
         log.info("Adding file[%s] with size[%,d]. Total size so far[%,d]", file, file.length(), totalSize);
         if (file.length() >= Integer.MAX_VALUE) {
-          zipOut.close();
-          outputZipFile.delete();
+          zipOut.finish();
           throw new IOException(String.format("file[%s] too large [%,d]", file, file.length()));
         }
         zipOut.putNextEntry(new ZipEntry(file.getName()));
         totalSize += ByteStreams.copy(Files.newInputStreamSupplier(file), zipOut);
       }
+      zipOut.closeEntry();
     }
     finally {
-      Closeables.closeQuietly(zipOut);
+      if (zipOut != null) {
+        zipOut.finish();
+      }
     }
 
     return totalSize;
@@ -100,11 +117,12 @@ public class CompressionUtils
 
     ZipEntry entry;
     while ((entry = zipIn.getNextEntry()) != null) {
-      OutputStream out = null;
+      FileOutputStream out = null;
       try {
         out = new FileOutputStream(new File(outDir, entry.getName()));
         ByteStreams.copy(zipIn, out);
         zipIn.closeEntry();
+        out.close();
       }
       finally {
         Closeables.closeQuietly(out);
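The point of splitting zip() into a File-based wrapper and an OutputStream-based worker is that callers can now compress a segment directory straight onto any stream, which is what the HDFS pusher below does with an FSDataOutputStream. A minimal round-trip sketch of the two utilities as changed here; the paths and class name are invented for the example:

import com.metamx.druid.utils.CompressionUtils;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;

class CompressionUtilsRoundTrip
{
  public static void main(String[] args) throws IOException
  {
    File indexDir = new File("/tmp/example-index");       // hypothetical directory to compress
    File restoredDir = new File("/tmp/example-restored"); // hypothetical extraction target
    restoredDir.mkdirs();

    // New overload: zip a directory onto an arbitrary OutputStream (here an in-memory buffer,
    // in the pusher below an HDFS FSDataOutputStream).
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    long zippedBytes = CompressionUtils.zip(indexDir, buffer);

    // unzip() consumes any InputStream, which is how the HDFS puller feeds it an FSDataInputStream.
    CompressionUtils.unzip(new ByteArrayInputStream(buffer.toByteArray()), restoredDir);

    System.out.println("round-tripped " + zippedBytes + " bytes");
  }
}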
@@ -70,13 +70,6 @@
         <dependency>
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-core</artifactId>
-            <version>0.20.2</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.mortbay.jetty</groupId>
-                    <artifactId>servlet-api-2.5</artifactId>
-                </exclusion>
-            </exclusions>
         </dependency>
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
@@ -19,6 +19,7 @@
 
 package com.metamx.druid.indexer;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.base.Optional;
 import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableList;
@@ -47,6 +48,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.InvalidJobConfException;
@@ -58,7 +60,6 @@ import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import com.fasterxml.jackson.databind.ObjectMapper;
 import org.joda.time.DateTime;
 import org.joda.time.Interval;
 
@@ -417,6 +418,11 @@ public class IndexGeneratorJob implements Jobby
             "type", "local",
             "path", indexOutURI.getPath()
         );
+      } else if (outputFS instanceof DistributedFileSystem) {
+        loadSpec = ImmutableMap.<String, Object>of(
+            "type", "hdfs",
+            "path", indexOutURI.getPath()
+        );
       } else {
         throw new ISE("Unknown file system[%s]", outputFS.getClass());
       }
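For context, the new branch mirrors the existing "local" case: when the job's output filesystem is a DistributedFileSystem, the segment descriptor gets an "hdfs" loadSpec, which is the key the DelegatingSegmentLoader in ServerInit (below) uses to route the segment to the HDFS puller. An illustrative sketch of the resulting map; the path value is invented, the real one comes from indexOutURI.getPath():

import com.google.common.collect.ImmutableMap;

import java.util.Map;

class HdfsLoadSpecExample
{
  // Illustration only: the shape of the loadSpec emitted for an HDFS output location.
  static final Map<String, Object> HDFS_LOAD_SPEC = ImmutableMap.<String, Object>of(
      "type", "hdfs",
      "path", "/druid/indexer-output/wikipedia/index.zip"  // hypothetical path
  );
}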
pom.xml
@@ -138,6 +138,17 @@
                 <artifactId>curator-x-discovery</artifactId>
                 <version>${netflix.curator.version}</version>
             </dependency>
+            <dependency>
+                <groupId>org.apache.hadoop</groupId>
+                <artifactId>hadoop-core</artifactId>
+                <version>0.20.2</version>
+                <exclusions>
+                    <exclusion>
+                        <groupId>org.mortbay.jetty</groupId>
+                        <artifactId>servlet-api-2.5</artifactId>
+                    </exclusion>
+                </exclusions>
+            </dependency>
             <dependency>
                 <groupId>it.uniroma3.mat</groupId>
                 <artifactId>extendedset</artifactId>
@@ -168,6 +168,10 @@
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-log4j12</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-core</artifactId>
+        </dependency>
 
         <!-- Dependencies required for jets3t b/c emr pom doesn't include them -->
         <dependency>
@@ -27,8 +27,13 @@ import com.google.common.collect.Maps;
 import com.metamx.common.ISE;
 import com.metamx.common.logger.Logger;
 import com.metamx.druid.DruidProcessingConfig;
+import com.metamx.druid.Query;
+import com.metamx.druid.collect.StupidPool;
 import com.metamx.druid.loading.DataSegmentPusher;
 import com.metamx.druid.loading.DelegatingSegmentLoader;
+import com.metamx.druid.loading.HdfsDataSegmentPuller;
+import com.metamx.druid.loading.HdfsDataSegmentPusher;
+import com.metamx.druid.loading.HdfsDataSegmentPusherConfig;
 import com.metamx.druid.loading.LocalDataSegmentPuller;
 import com.metamx.druid.loading.LocalDataSegmentPusher;
 import com.metamx.druid.loading.LocalDataSegmentPusherConfig;
@@ -37,15 +42,13 @@ import com.metamx.druid.loading.QueryableIndexFactory;
 import com.metamx.druid.loading.S3DataSegmentPuller;
 import com.metamx.druid.loading.S3DataSegmentPusher;
 import com.metamx.druid.loading.S3DataSegmentPusherConfig;
+import com.metamx.druid.loading.SegmentLoader;
 import com.metamx.druid.loading.SegmentLoaderConfig;
 import com.metamx.druid.loading.SingleSegmentLoader;
-import com.metamx.druid.query.group.GroupByQueryEngine;
-import com.metamx.druid.query.group.GroupByQueryEngineConfig;
-import com.metamx.druid.Query;
-import com.metamx.druid.collect.StupidPool;
-import com.metamx.druid.loading.SegmentLoader;
 import com.metamx.druid.query.QueryRunnerFactory;
 import com.metamx.druid.query.group.GroupByQuery;
+import com.metamx.druid.query.group.GroupByQueryEngine;
+import com.metamx.druid.query.group.GroupByQueryEngineConfig;
 import com.metamx.druid.query.group.GroupByQueryRunnerFactory;
 import com.metamx.druid.query.group.GroupByQueryRunnerFactoryConfig;
 import com.metamx.druid.query.metadata.SegmentMetadataQuery;
@@ -57,6 +60,7 @@ import com.metamx.druid.query.timeboundary.TimeBoundaryQueryRunnerFactory;
 import com.metamx.druid.query.timeseries.TimeseriesQuery;
 import com.metamx.druid.query.timeseries.TimeseriesQueryRunnerFactory;
 import com.metamx.druid.utils.PropUtils;
+import org.apache.hadoop.conf.Configuration;
 import org.jets3t.service.S3ServiceException;
 import org.jets3t.service.impl.rest.httpclient.RestS3Service;
 import org.jets3t.service.security.AWSCredentials;
@@ -85,13 +89,13 @@ public class ServerInit
     final QueryableIndexFactory factory = new MMappedQueryableIndexFactory();
 
     SingleSegmentLoader s3segmentLoader = new SingleSegmentLoader(segmentGetter, factory, config);
-    SingleSegmentLoader localSegmentLoader = new SingleSegmentLoader(new LocalDataSegmentPuller(), factory, config);
 
     delegateLoader.setLoaderTypes(
         ImmutableMap.<String, SegmentLoader>builder()
                     .put("s3", s3segmentLoader)
                     .put("s3_zip", s3segmentLoader)
-                    .put("local", localSegmentLoader)
+                    .put("local", new SingleSegmentLoader(new LocalDataSegmentPuller(), factory, config))
+                    .put("hdfs", new SingleSegmentLoader(new HdfsDataSegmentPuller(new Configuration()), factory, config))
                     .build()
     );
 
@@ -167,6 +171,11 @@ public class ServerInit
     if (Boolean.parseBoolean(props.getProperty("druid.pusher.local", "false"))) {
       return new LocalDataSegmentPusher(configFactory.build(LocalDataSegmentPusherConfig.class), jsonMapper);
     }
+    else if (Boolean.parseBoolean(props.getProperty("druid.pusher.hdfs", "false"))) {
+      final HdfsDataSegmentPusherConfig config = configFactory.build(HdfsDataSegmentPusherConfig.class);
+
+      return new HdfsDataSegmentPusher(config, new Configuration(), jsonMapper);
+    }
     else {
 
       final RestS3Service s3Client;
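Based on the branch above, the HDFS pusher is switched on by the druid.pusher.hdfs property and configured only by druid.pusher.hdfs.storageDirectory (see HdfsDataSegmentPusherConfig below). A small sketch of that selection logic; the property names come from this commit, the directory value does not:

import java.util.Properties;

class PusherSelectionSketch
{
  public static void main(String[] args)
  {
    Properties props = new Properties();
    props.setProperty("druid.pusher.hdfs", "true");
    props.setProperty("druid.pusher.hdfs.storageDirectory", "/druid/segments"); // example value

    // Mirrors the check added to ServerInit: only an explicit "true" selects the HDFS pusher,
    // otherwise the code falls through to the local/S3 branches.
    boolean useHdfs = Boolean.parseBoolean(props.getProperty("druid.pusher.hdfs", "false"));
    System.out.println(useHdfs ? "would build HdfsDataSegmentPusher" : "would fall back to local/S3");
  }
}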
@@ -0,0 +1,85 @@
+package com.metamx.druid.loading;
+
+import com.google.common.io.Closeables;
+import com.metamx.druid.client.DataSegment;
+import com.metamx.druid.utils.CompressionUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ */
+public class HdfsDataSegmentPuller implements DataSegmentPuller
+{
+  private final Configuration config;
+
+  public HdfsDataSegmentPuller(final Configuration config)
+  {
+    this.config = config;
+  }
+
+  @Override
+  public void getSegmentFiles(DataSegment segment, File dir) throws SegmentLoadingException
+  {
+    final Path path = getPath(segment);
+
+    final FileSystem fs = checkPathAndGetFilesystem(path);
+
+    FSDataInputStream in = null;
+    try {
+      if (path.getName().endsWith(".zip")) {
+        in = fs.open(path);
+        CompressionUtils.unzip(in, dir);
+        in.close();
+      }
+      else {
+        throw new SegmentLoadingException("Unknown file type[%s]", path);
+      }
+    }
+    catch (IOException e) {
+      throw new SegmentLoadingException(e, "Some IOException");
+    }
+    finally {
+      Closeables.closeQuietly(in);
+    }
+  }
+
+  @Override
+  public long getLastModified(DataSegment segment) throws SegmentLoadingException
+  {
+    Path path = getPath(segment);
+    FileSystem fs = checkPathAndGetFilesystem(path);
+
+    try {
+      return fs.getFileStatus(path).getModificationTime();
+    }
+    catch (IOException e) {
+      throw new SegmentLoadingException(e, "Problem loading status of path[%s]", path);
+    }
+  }
+
+  private Path getPath(DataSegment segment) {
+    return new Path(String.valueOf(segment.getLoadSpec().get("path")));
+  }
+
+  private FileSystem checkPathAndGetFilesystem(Path path) throws SegmentLoadingException
+  {
+    FileSystem fs;
+    try {
+      fs = path.getFileSystem(config);
+
+      if (!fs.exists(path)) {
+        throw new SegmentLoadingException("Path[%s] doesn't exist.", path);
+      }
+
+      return fs;
+    }
+    catch (IOException e) {
+      throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", path);
+    }
+  }
+}
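A minimal usage sketch of the new puller, assuming a DataSegment whose loadSpec was written by the HDFS pusher below (type "hdfs" with a path ending in .zip); the local cache directory and class name are invented:

import com.metamx.druid.client.DataSegment;
import com.metamx.druid.loading.HdfsDataSegmentPuller;
import com.metamx.druid.loading.SegmentLoadingException;
import org.apache.hadoop.conf.Configuration;

import java.io.File;

class HdfsPullerSketch
{
  static void pull(DataSegment segment) throws SegmentLoadingException
  {
    // A default Configuration picks up whatever Hadoop settings are on the classpath,
    // matching how ServerInit wires the puller above.
    HdfsDataSegmentPuller puller = new HdfsDataSegmentPuller(new Configuration());

    File localDir = new File("/tmp/druid/segment-cache/example"); // hypothetical cache dir
    localDir.mkdirs();

    puller.getSegmentFiles(segment, localDir);        // unzips index.zip from HDFS into localDir
    long hdfsMtime = puller.getLastModified(segment); // modification time of the HDFS file
  }
}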
@@ -0,0 +1,106 @@
+package com.metamx.druid.loading;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.io.ByteStreams;
+import com.google.common.io.Closeables;
+import com.google.common.io.OutputSupplier;
+import com.metamx.common.logger.Logger;
+import com.metamx.druid.client.DataSegment;
+import com.metamx.druid.index.v1.IndexIO;
+import com.metamx.druid.utils.CompressionUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ */
+public class HdfsDataSegmentPusher implements DataSegmentPusher
+{
+  private static final Logger log = new Logger(HdfsDataSegmentPusher.class);
+
+  private final HdfsDataSegmentPusherConfig config;
+  private final Configuration hadoopConfig;
+  private final ObjectMapper jsonMapper;
+
+  public HdfsDataSegmentPusher(
+      HdfsDataSegmentPusherConfig config,
+      Configuration hadoopConfig,
+      ObjectMapper jsonMapper
+  )
+  {
+    this.config = config;
+    this.hadoopConfig = hadoopConfig;
+    this.jsonMapper = jsonMapper;
+  }
+
+  @Override
+  public DataSegment push(File inDir, DataSegment segment) throws IOException
+  {
+    final String storageDir = DataSegmentPusherUtil.getStorageDir(segment);
+    Path outFile = new Path(String.format("%s/%s/index.zip", config.getStorageDirectory(), storageDir));
+    FileSystem fs = outFile.getFileSystem(hadoopConfig);
+
+    fs.mkdirs(outFile.getParent());
+    log.info("Compressing files from[%s] to [%s]", inDir, outFile);
+    FSDataOutputStream out = null;
+    long size;
+    try {
+      out = fs.create(outFile);
+
+      size = CompressionUtils.zip(inDir, out);
+
+      out.close();
+    }
+    finally {
+      Closeables.closeQuietly(out);
+    }
+
+    return createDescriptorFile(
+        segment.withLoadSpec(makeLoadSpec(outFile))
+               .withSize(size)
+               .withBinaryVersion(IndexIO.CURRENT_VERSION_ID),
+        outFile.getParent(),
+        fs
+    );
+  }
+
+  private DataSegment createDescriptorFile(DataSegment segment, Path outDir, final FileSystem fs) throws IOException
+  {
+    final Path descriptorFile = new Path(outDir, "descriptor.json");
+    log.info("Creating descriptor file at[%s]", descriptorFile);
+    ByteStreams.copy(
+        ByteStreams.newInputStreamSupplier(jsonMapper.writeValueAsBytes(segment)),
+        new HdfsOutputStreamSupplier(fs, descriptorFile)
+    );
+    return segment;
+  }
+
+  private ImmutableMap<String, Object> makeLoadSpec(Path outFile)
+  {
+    return ImmutableMap.<String, Object>of("type", "hdfs", "path", outFile.toString());
+  }
+
+  private static class HdfsOutputStreamSupplier implements OutputSupplier<OutputStream>
+  {
+    private final FileSystem fs;
+    private final Path descriptorFile;
+
+    public HdfsOutputStreamSupplier(FileSystem fs, Path descriptorFile)
+    {
+      this.fs = fs;
+      this.descriptorFile = descriptorFile;
+    }
+
+    @Override
+    public OutputStream getOutput() throws IOException
+    {
+      return fs.create(descriptorFile);
+    }
+  }
+}
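And the matching pusher-side sketch: push() zips the merged index directory to <storageDirectory>/<segment storage dir>/index.zip, writes descriptor.json next to it, and returns the segment rewritten with an "hdfs" loadSpec. The config would normally come from configFactory.build(HdfsDataSegmentPusherConfig.class) as in ServerInit; the local directory, class name, and the bare ObjectMapper are just for illustration:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.metamx.druid.client.DataSegment;
import com.metamx.druid.loading.HdfsDataSegmentPusher;
import com.metamx.druid.loading.HdfsDataSegmentPusherConfig;
import org.apache.hadoop.conf.Configuration;

import java.io.File;
import java.io.IOException;

class HdfsPusherSketch
{
  static DataSegment push(HdfsDataSegmentPusherConfig config, DataSegment segment) throws IOException
  {
    HdfsDataSegmentPusher pusher =
        new HdfsDataSegmentPusher(config, new Configuration(), new ObjectMapper());

    File mergedIndexDir = new File("/tmp/druid/merged/example"); // hypothetical local index dir

    // Uploads index.zip and descriptor.json under the configured storageDirectory and
    // returns the segment with loadSpec {"type": "hdfs", "path": ".../index.zip"}.
    return pusher.push(mergedIndexDir, segment);
  }
}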
@@ -0,0 +1,32 @@
+/*
+ * Druid - a distributed column store.
+ * Copyright (C) 2012 Metamarkets Group Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+package com.metamx.druid.loading;
+
+import org.skife.config.Config;
+
+import java.io.File;
+
+/**
+ */
+public abstract class HdfsDataSegmentPusherConfig
+{
+  @Config("druid.pusher.hdfs.storageDirectory")
+  public abstract File getStorageDirectory();
+}