--- layout: doc_page --- Example Production Hadoop Configuration ======================================= The following configuration should work relatively well for Druid indexing and Hadoop. In the example, we are using Hadoop 2.4 with EC2 m1.xlarge nodes for NameNodes and cc2.8xlarge nodes for DataNodes. ### Core-site.xml ``` hadoop.tmp.dir /mnt/persistent/hadoop fs.defaultFS hdfs://#{IP}:9000 fs.s3.impl org.apache.hadoop.fs.s3native.NativeS3FileSystem fs.s3.awsAccessKeyId #{S3_ACCESS_KEY} fs.s3.awsSecretAccessKey #{S3_SECRET_KEY} fs.s3.buffer.dir /mnt/persistent/hadoop-s3n fs.s3n.awsAccessKeyId #{S3N_ACCESS_KEY} fs.s3n.awsSecretAccessKey #{S3N_SECRET_KEY} io.compression.codecs org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.Lz4Codec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec io.seqfile.local.dir /mnt/persistent/hadoop/io/local ``` ### Mapred-site.xml ``` mapreduce.framework.name yarn mapreduce.jobtracker.address #{JT_ADDR}:9001 mapreduce.jobtracker.http.address #{JT_HTTP_ADDR}:9100 mapreduce.jobhistory.address #{JH_ADDR}:10020 mapreduce.jobhistory.webapp.address #{JH_WEBAPP_ADDR}:19888 mapreduce.tasktracker.http.address #{TT_ADDR}:9103 mapreduce.job.reduces 21 mapreduce.job.jvm.numtasks 20 mapreduce.map.memory.mb 2048 mapreduce.map.java.opts -server -Xmx1536m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps mapreduce.reduce.memory.mb 6144 mapreduce.reduce.java.opts -server -Xmx2560m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps mapreduce.reduce.shuffle.parallelcopies 50 mapreduce.reduce.shuffle.input.buffer.percent 0.5 mapreduce.task.io.sort.mb 256 mapreduce.task.io.sort.factor 100 mapreduce.jobtracker.handler.count 64 mapreduce.tasktracker.http.threads 20 mapreduce.cluster.local.dir /mnt/persistent/hadoop/mapred/local mapreduce.jobhistory.recovery.enable true 
mapreduce.jobhistory.recovery.store.class org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService mapreduce.jobhistory.recovery.store.fs.uri file://${hadoop.tmp.dir}/mapred-jobhistory-state mapreduce.output.fileoutputformat.compress false mapreduce.map.output.compress true mapreduce.output.fileoutputformat.compress.type BLOCK mapreduce.map.output.compress.codec org.apache.hadoop.io.compress.Lz4Codec mapreduce.output.fileoutputformat.compress.codec org.apache.hadoop.io.compress.GzipCodec mapreduce.map.speculative false mapreduce.reduce.speculative false mapreduce.task.timeout 1800000 ``` ### Yarn-site.xml ``` yarn.resourcemanager.hostname #{RM_HOSTNAME} yarn.resourcemanager.scheduler.class org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler yarn.nodemanager.aux-services mapreduce_shuffle yarn.log-aggregation-enable true yarn.log.server.url http://#{IP_LOG_SERVER}:19888/jobhistory/logs/ yarn.nodemanager.hostname #{IP_ADDR} yarn.scheduler.minimum-allocation-mb 512 yarn.nodemanager.resource.memory-mb 1024 yarn.nodemanager.resource.cpu-vcores 1 yarn.nodemanager.vmem-check-enabled false yarn.nodemanager.local-dirs /mnt/persistent/hadoop/nm-local-dir yarn.resourcemanager.recovery.enabled false yarn.resourcemanager.store.class org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore yarn.resourcemanager.fs.state-store.uri file://${hadoop.tmp.dir}/yarn-resourcemanager-state yarn.resourcemanager.nodes.exclude-path /mnt/persistent/hadoop/yarn-exclude.txt ``` ### HDFS-site.xml ``` dfs.replication 3 dfs.namenode.datanode.registration.ip-hostname-check false dfs.hosts.exclude /mnt/persistent/hadoop/hdfs-exclude.txt dfs.datanode.data.dir file:///mnt/persistent/hadoop/dfs/data ``` ### Capacity-scheduler.xml ``` yarn.scheduler.capacity.maximum-am-resource-percent 0.1 yarn.scheduler.capacity.root.queues default yarn.scheduler.capacity.root.default.capacity 100 
yarn.scheduler.capacity.root.default.user-limit-factor 1 yarn.scheduler.capacity.root.default.maximum-capacity 100 yarn.scheduler.capacity.root.default.state RUNNING yarn.scheduler.capacity.root.default.acl_submit_applications * yarn.scheduler.capacity.root.default.acl_administer_queue * yarn.scheduler.capacity.node-locality-delay -1 ```