---
layout: doc_page
---
Example Production Hadoop Configuration
=======================================
The following configuration should work relatively well for Druid indexing and Hadoop. In the example, we are using Hadoop 2.4 with EC2 m1.xlarge nodes for NameNodes and cc2.8xlarge nodes for DataNodes.
### Core-site.xml
```
<configuration>

  <!-- Temporary directory on HDFS (but also sometimes local!) -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/mnt/persistent/hadoop</value>
  </property>

  <!-- S3 -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://#{IP}:9000</value>
  </property>
  <property>
    <name>fs.s3.impl</name>
    <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value>
  </property>
  <property>
    <name>fs.s3.awsAccessKeyId</name>
    <value>#{S3_ACCESS_KEY}</value>
  </property>
  <property>
    <name>fs.s3.awsSecretAccessKey</name>
    <value>#{S3_SECRET_KEY}</value>
  </property>
  <property>
    <name>fs.s3.buffer.dir</name>
    <value>/mnt/persistent/hadoop-s3n</value>
  </property>
  <property>
    <name>fs.s3n.awsAccessKeyId</name>
    <value>#{S3N_ACCESS_KEY}</value>
  </property>
  <property>
    <name>fs.s3n.awsSecretAccessKey</name>
    <value>#{S3N_SECRET_KEY}</value>
  </property>

  <!-- Compression -->
  <property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.Lz4Codec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>io.seqfile.local.dir</name>
    <value>/mnt/persistent/hadoop/io/local</value>
  </property>

</configuration>
```
### Mapred-site.xml
```
<configuration>

  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.address</name>
    <value>#{JT_ADDR}:9001</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.http.address</name>
    <value>#{JT_HTTP_ADDR}:9100</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>#{JH_ADDR}:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>#{JH_WEBAPP_ADDR}:19888</value>
  </property>
  <property>
    <name>mapreduce.tasktracker.http.address</name>
    <value>#{TT_ADDR}:9103</value>
  </property>

  <!-- Memory and concurrency tuning -->
  <property>
    <name>mapreduce.job.reduces</name>
    <value>21</value>
  </property>
  <property>
    <name>mapreduce.job.jvm.numtasks</name>
    <value>20</value>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>mapreduce.map.java.opts</name>
    <value>-server -Xmx1536m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>6144</value>
  </property>
  <property>
    <name>mapreduce.reduce.java.opts</name>
    <value>-server -Xmx2560m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
  </property>
  <property>
    <name>mapreduce.reduce.shuffle.parallelcopies</name>
    <value>50</value>
  </property>
  <property>
    <name>mapreduce.reduce.shuffle.input.buffer.percent</name>
    <value>0.5</value>
  </property>
  <property>
    <name>mapreduce.task.io.sort.mb</name>
    <value>256</value>
  </property>
  <property>
    <name>mapreduce.task.io.sort.factor</name>
    <value>100</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.handler.count</name>
    <value>64</value>
  </property>
  <property>
    <name>mapreduce.tasktracker.http.threads</name>
    <value>20</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>mapreduce.cluster.local.dir</name>
    <value>/mnt/persistent/hadoop/mapred/local</value>
  </property>

  <!-- Job history server persistent state -->
  <property>
    <name>mapreduce.jobhistory.recovery.enable</name>
    <value>true</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.recovery.store.class</name>
    <value>org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.recovery.store.fs.uri</name>
    <value>file://${hadoop.tmp.dir}/mapred-jobhistory-state</value>
  </property>

  <!-- Compression -->
  <property>
    <!-- Off by default, because it breaks Druid indexing (at least, it does in Druid 0.6.10+). Jobs should turn
         it on if they need it. -->
    <name>mapreduce.output.fileoutputformat.compress</name>
    <value>false</value>
  </property>
  <property>
    <name>mapreduce.map.output.compress</name>
    <value>true</value>
  </property>
  <property>
    <name>mapreduce.output.fileoutputformat.compress.type</name>
    <value>BLOCK</value>
  </property>
  <property>
    <name>mapreduce.map.output.compress.codec</name>
    <value>org.apache.hadoop.io.compress.Lz4Codec</value>
  </property>
  <property>
    <name>mapreduce.output.fileoutputformat.compress.codec</name>
    <value>org.apache.hadoop.io.compress.GzipCodec</value>
  </property>

  <!-- Speculative execution would violate various assumptions we've made in our system design -->
  <property>
    <name>mapreduce.map.speculative</name>
    <value>false</value>
  </property>
  <property>
    <name>mapreduce.reduce.speculative</name>
    <value>false</value>
  </property>

  <!-- Sometimes jobs take a long time to run, but really, they're okay. Examples: Long index persists,
       hadoop reading lots of empty files into a single mapper. Let's increase the timeout to 30 minutes. -->
  <property>
    <name>mapreduce.task.timeout</name>
    <value>1800000</value>
  </property>

</configuration>
```
### Yarn-site.xml
```
<configuration>

  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>#{RM_HOSTNAME}</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log.server.url</name>
    <value>http://#{IP_LOG_SERVER}:19888/jobhistory/logs/</value>
  </property>
  <property>
    <name>yarn.nodemanager.hostname</name>
    <value>#{IP_ADDR}</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>512</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>1024</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>yarn.nodemanager.local-dirs</name>
    <value>/mnt/persistent/hadoop/nm-local-dir</value>
  </property>

  <!-- ResourceManager persistent state doesn't work well in tests yet, so disable it -->
  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
  </property>
  <property>
    <name>yarn.resourcemanager.fs.state-store.uri</name>
    <value>file://${hadoop.tmp.dir}/yarn-resourcemanager-state</value>
  </property>

  <!-- Ability to exclude hosts -->
  <property>
    <name>yarn.resourcemanager.nodes.exclude-path</name>
    <value>/mnt/persistent/hadoop/yarn-exclude.txt</value>
  </property>

</configuration>
```
### HDFS-site.xml
```
<configuration>

  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.hosts.exclude</name>
    <value>/mnt/persistent/hadoop/hdfs-exclude.txt</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///mnt/persistent/hadoop/dfs/data</value>
  </property>

</configuration>
```
### Capacity-scheduler.xml
```
<configuration>

  <property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.1</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.state</name>
    <value>RUNNING</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.node-locality-delay</name>
    <value>-1</value>
  </property>

</configuration>
```