diff --git a/docs/content/Hadoop-Configuration.md b/docs/content/Hadoop-Configuration.md new file mode 100644 index 00000000000..296558fcee7 --- /dev/null +++ b/docs/content/Hadoop-Configuration.md @@ -0,0 +1,360 @@ +--- +layout: doc_page +--- + +Example Production Hadoop Configuration +======================================= + +The following configuration should work relatively well for Druid indexing and Hadoop. In the example, we are using EC2 cc2.8xlarge nodes. + +### Core-site.xml + +``` + + + + + hadoop.tmp.dir + /mnt/persistent/hadoop + + + + + fs.defaultFS + hdfs://#{IP}:9000 + + + fs.s3.impl + org.apache.hadoop.fs.s3native.NativeS3FileSystem + + + fs.s3.awsAccessKeyId + #{S3_ACCESS_KEY} + + + fs.s3.awsSecretAccessKey + #{S3_SECRET_KEY} + + + fs.s3.buffer.dir + /mnt/persistent/hadoop-s3n + + + fs.s3n.awsAccessKeyId + #{S3N_ACCESS_KEY} + + + fs.s3n.awsSecretAccessKey + #{S3N_SECRET_KEY} + + + + + io.compression.codecs + org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.Lz4Codec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec + + + + + io.seqfile.local.dir + /mnt/persistent/hadoop/io/local + + + +``` + +### Mapred-site.xml + +``` + + + + mapreduce.framework.name + yarn + + + + mapreduce.jobtracker.address + #{JT_ADDR}:9001 + + + mapreduce.jobtracker.http.address + #{JT_HTTP_ADDR}:9100 + + + mapreduce.jobhistory.address + #{JH_ADDR}:10020 + + + mapreduce.jobhistory.webapp.address + #{JH_WEBAPP_ADDR}:19888 + + + mapreduce.tasktracker.http.address + #{TT_ADDR}:9103 + + + + + mapreduce.job.reduces + 21 + + + + mapreduce.job.jvm.numtasks + 20 + + + mapreduce.map.memory.mb + 2048 + + + mapreduce.map.java.opts + -server -Xmx1536m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps + + + mapreduce.reduce.memory.mb + 6144 + + + mapreduce.reduce.java.opts + -server -Xmx2560m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails 
-XX:+PrintGCTimeStamps + + + mapreduce.reduce.shuffle.parallelcopies + 50 + + + mapreduce.reduce.shuffle.input.buffer.percent + 0.5 + + + mapreduce.task.io.sort.mb + 256 + + + mapreduce.task.io.sort.factor + 100 + + + mapreduce.jobtracker.handler.count + 64 + + + mapreduce.tasktracker.http.threads + 20 + + + + + mapreduce.cluster.local.dir + /mnt/persistent/hadoop/mapred/local + + + + + mapreduce.jobhistory.recovery.enable + true + + + mapreduce.jobhistory.recovery.store.class + org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService + + + mapreduce.jobhistory.recovery.store.fs.uri + file://${hadoop.tmp.dir}/mapred-jobhistory-state + + + + + + mapreduce.output.fileoutputformat.compress + false + + + mapreduce.map.output.compress + true + + + mapreduce.output.fileoutputformat.compress.type + BLOCK + + + mapreduce.map.output.compress.codec + org.apache.hadoop.io.compress.Lz4Codec + + + mapreduce.output.fileoutputformat.compress.codec + org.apache.hadoop.io.compress.GzipCodec + + + + mapreduce.map.speculative + false + + + mapreduce.reduce.speculative + false + + + + + mapreduce.task.timeout + 1800000 + + + +``` + +### Yarn-site.xml + +``` + + + + yarn.resourcemanager.hostname + #{RM_HOSTNAME} + + + yarn.resourcemanager.scheduler.class + org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + + yarn.log-aggregation-enable + true + + + yarn.log.server.url + http://#{IP_LOG_SERVER}:19888/jobhistory/logs/ + + + yarn.nodemanager.hostname + #{IP_ADDR} + + + yarn.scheduler.minimum-allocation-mb + 512 + + + yarn.nodemanager.resource.memory-mb + 1024 + + + yarn.nodemanager.resource.cpu-vcores + 1 + + + yarn.nodemanager.vmem-check-enabled + false + + + + + yarn.nodemanager.local-dirs + /mnt/persistent/hadoop/nm-local-dir + + + + + yarn.resourcemanager.recovery.enabled + false + + + yarn.resourcemanager.store.class + 
org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore + + + yarn.resourcemanager.fs.state-store.uri + file://${hadoop.tmp.dir}/yarn-resourcemanager-state + + + + + yarn.resourcemanager.nodes.exclude-path + /mnt/persistent/hadoop/yarn-exclude.txt + + + +``` + +### HDFS-site.xml + +``` + + + + dfs.replication + 3 + + + dfs.namenode.datanode.registration.ip-hostname-check + false + + + dfs.hosts.exclude + /mnt/persistent/hadoop/hdfs-exclude.txt + + + + + dfs.datanode.data.dir + file:///mnt/persistent/hadoop/dfs/data + + + +``` + +### Capacity-scheduler.xml + +``` + + + + yarn.scheduler.capacity.maximum-am-resource-percent + 0.1 + + + yarn.scheduler.capacity.root.queues + default + + + yarn.scheduler.capacity.root.default.capacity + 100 + + + yarn.scheduler.capacity.root.default.user-limit-factor + 1 + + + yarn.scheduler.capacity.root.default.maximum-capacity + 100 + + + yarn.scheduler.capacity.root.default.state + RUNNING + + + yarn.scheduler.capacity.root.default.acl_submit_applications + * + + + yarn.scheduler.capacity.root.default.acl_administer_queue + * + + + yarn.scheduler.capacity.node-locality-delay + -1 + + + +``` \ No newline at end of file diff --git a/docs/content/toc.textile b/docs/content/toc.textile index d811e68b55a..cc3c86eac99 100644 --- a/docs/content/toc.textile +++ b/docs/content/toc.textile @@ -17,6 +17,7 @@ h2. Getting Started h2. Booting a Druid Cluster * "Simple Cluster Configuration":Simple-Cluster-Configuration.html * "Production Cluster Configuration":Production-Cluster-Configuration.html +* "Production Hadoop Configuration":Hadoop-Configuration.html * "Rolling Cluster Updates":Rolling-Updates.html h2. Configuration