mirror of https://github.com/apache/druid.git
---
layout: doc_page
---
Example Production Hadoop Configuration
The following configuration should work relatively well for Druid indexing and Hadoop. In the example, we are using Hadoop 2.4 with EC2 m1.xlarge nodes for NameNodes and cc2.8xlarge nodes for DataNodes.
### core-site.xml
<configuration>

  <!-- Temporary directory on HDFS (but also sometimes local!) -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/mnt/persistent/hadoop</value>
  </property>

  <!-- Default filesystem plus S3 / S3N credentials -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://#{IP}:9000</value>
  </property>
  <property>
    <name>fs.s3.impl</name>
    <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value>
  </property>
  <property>
    <name>fs.s3.awsAccessKeyId</name>
    <value>#{S3_ACCESS_KEY}</value>
  </property>
  <property>
    <name>fs.s3.awsSecretAccessKey</name>
    <value>#{S3_SECRET_KEY}</value>
  </property>
  <property>
    <name>fs.s3.buffer.dir</name>
    <value>/mnt/persistent/hadoop-s3n</value>
  </property>
  <property>
    <name>fs.s3n.awsAccessKeyId</name>
    <value>#{S3N_ACCESS_KEY}</value>
  </property>
  <property>
    <name>fs.s3n.awsSecretAccessKey</name>
    <value>#{S3N_SECRET_KEY}</value>
  </property>

  <!-- Compression codecs made available to jobs -->
  <property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.Lz4Codec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>io.seqfile.local.dir</name>
    <value>/mnt/persistent/hadoop/io/local</value>
  </property>

</configuration>
### mapred-site.xml
<configuration>

  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.address</name>
    <value>#{JT_ADDR}:9001</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.http.address</name>
    <value>#{JT_HTTP_ADDR}:9100</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>#{JH_ADDR}:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>#{JH_WEBAPP_ADDR}:19888</value>
  </property>
  <property>
    <name>mapreduce.tasktracker.http.address</name>
    <value>#{TT_ADDR}:9103</value>
  </property>

  <!-- Memory and concurrency tuning -->
  <property>
    <name>mapreduce.job.reduces</name>
    <value>21</value>
  </property>
  <!-- NOTE: a stray duplicate <property> open tag was removed here; the original
       snippet had two consecutive <property> tags with only one closing tag,
       which made the document non-well-formed. -->
  <property>
    <name>mapreduce.job.jvm.numtasks</name>
    <value>20</value>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>mapreduce.map.java.opts</name>
    <value>-server -Xmx1536m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>6144</value>
  </property>
  <property>
    <name>mapreduce.reduce.java.opts</name>
    <value>-server -Xmx2560m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
  </property>
  <property>
    <name>mapreduce.reduce.shuffle.parallelcopies</name>
    <value>50</value>
  </property>
  <property>
    <name>mapreduce.reduce.shuffle.input.buffer.percent</name>
    <value>0.5</value>
  </property>
  <property>
    <name>mapreduce.task.io.sort.mb</name>
    <value>256</value>
  </property>
  <property>
    <name>mapreduce.task.io.sort.factor</name>
    <value>100</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.handler.count</name>
    <value>64</value>
  </property>
  <property>
    <name>mapreduce.tasktracker.http.threads</name>
    <value>20</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>mapreduce.cluster.local.dir</name>
    <value>/mnt/persistent/hadoop/mapred/local</value>
  </property>

  <!-- Job history server persistent state -->
  <property>
    <name>mapreduce.jobhistory.recovery.enable</name>
    <value>true</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.recovery.store.class</name>
    <value>org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.recovery.store.fs.uri</name>
    <value>file://${hadoop.tmp.dir}/mapred-jobhistory-state</value>
  </property>

  <!-- Compression -->
  <property>
    <!-- Off by default, because it breaks Druid indexing (at least, it does in druid-0.6.10+). Jobs should turn
         it on if they need it. -->
    <name>mapreduce.output.fileoutputformat.compress</name>
    <value>false</value>
  </property>
  <property>
    <name>mapreduce.map.output.compress</name>
    <value>true</value>
  </property>
  <property>
    <name>mapreduce.output.fileoutputformat.compress.type</name>
    <value>BLOCK</value>
  </property>
  <property>
    <name>mapreduce.map.output.compress.codec</name>
    <value>org.apache.hadoop.io.compress.Lz4Codec</value>
  </property>
  <property>
    <name>mapreduce.output.fileoutputformat.compress.codec</name>
    <value>org.apache.hadoop.io.compress.GzipCodec</value>
  </property>

  <!-- Speculative execution would violate various assumptions we've made in our system design -->
  <property>
    <name>mapreduce.map.speculative</name>
    <value>false</value>
  </property>
  <property>
    <name>mapreduce.reduce.speculative</name>
    <value>false</value>
  </property>

  <!-- Sometimes jobs take a long time to run, but really, they're okay. Examples: Long index persists,
       hadoop reading lots of empty files into a single mapper. Let's increase the timeout to 30 minutes. -->
  <property>
    <name>mapreduce.task.timeout</name>
    <value>1800000</value>
  </property>

</configuration>
### yarn-site.xml
<configuration>

  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>#{RM_HOSTNAME}</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log.server.url</name>
    <value>http://#{IP_LOG_SERVER}:19888/jobhistory/logs/</value>
  </property>
  <property>
    <name>yarn.nodemanager.hostname</name>
    <value>#{IP_ADDR}</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>512</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>1024</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>yarn.nodemanager.local-dirs</name>
    <value>/mnt/persistent/hadoop/nm-local-dir</value>
  </property>

  <!-- ResourceManager persistent state doesn't work well in tests yet, so disable it -->
  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
  </property>
  <property>
    <name>yarn.resourcemanager.fs.state-store.uri</name>
    <value>file://${hadoop.tmp.dir}/yarn-resourcemanager-state</value>
  </property>

  <!-- Ability to exclude hosts -->
  <property>
    <name>yarn.resourcemanager.nodes.exclude-path</name>
    <value>/mnt/persistent/hadoop/yarn-exclude.txt</value>
  </property>

</configuration>
### hdfs-site.xml
<configuration>

  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.hosts.exclude</name>
    <value>/mnt/persistent/hadoop/hdfs-exclude.txt</value>
  </property>

  <!-- JBOD -->
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///mnt/persistent/hadoop/dfs/data</value>
  </property>

</configuration>
### capacity-scheduler.xml
<configuration>

  <property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.1</value>
  </property>
  <!-- NOTE: the original snippet declared yarn.scheduler.capacity.root.queues twice
       (with the identical value "default"); the redundant duplicate has been removed. -->
  <property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.state</name>
    <value>RUNNING</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.node-locality-delay</name>
    <value>-1</value>
  </property>

</configuration>