---
layout: doc_page
---
Example Production Hadoop Configuration
=======================================
The following configuration should work reasonably well for Druid indexing with Hadoop. This example uses Hadoop 2.4 on EC2, with m1.xlarge instances for the NameNodes and cc2.8xlarge instances for the DataNodes. Values written as `#{...}` are deployment-specific placeholders; replace them with the hostnames, ports, and credentials of your own cluster. To use these files from Druid, they are typically placed on the classpath of the processes that run Hadoop indexing tasks.
### Core-site.xml
```
<configuration>
<!-- Temporary directory on HDFS (but also sometimes local!) -->
<property>
<name>hadoop.tmp.dir</name>
<value>/mnt/persistent/hadoop</value>
</property>
<!-- S3 -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://#{IP}:9000</value>
</property>
<property>
<name>fs.s3.impl</name>
<value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value>
</property>
<property>
<name>fs.s3.awsAccessKeyId</name>
<value>#{S3_ACCESS_KEY}</value>
</property>
<property>
<name>fs.s3.awsSecretAccessKey</name>
<value>#{S3_SECRET_KEY}</value>
</property>
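<!-- Local directory where the S3 filesystems buffer data before uploading it to S3 -->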
<property>
<name>fs.s3.buffer.dir</name>
<value>/mnt/persistent/hadoop-s3n</value>
</property>
<property>
<name>fs.s3n.awsAccessKeyId</name>
<value>#{S3N_ACCESS_KEY}</value>
</property>
<property>
<name>fs.s3n.awsSecretAccessKey</name>
<value>#{S3N_SECRET_KEY}</value>
</property>
<!-- Compression -->
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.Lz4Codec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<!-- JBOD -->
<property>
<name>io.seqfile.local.dir</name>
<value>/mnt/persistent/hadoop/io/local</value>
</property>
</configuration>
```
### Mapred-site.xml
```
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobtracker.address</name>
<value>#{JT_ADDR}:9001</value>
</property>
<property>
<name>mapreduce.jobtracker.http.address</name>
<value>#{JT_HTTP_ADDR}:9100</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>#{JH_ADDR}:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>#{JH_WEBAPP_ADDR}:19888</value>
</property>
<property>
<name>mapreduce.tasktracker.http.address</name>
<value>#{TT_ADDR}:9103</value>
</property>
<!-- Memory and concurrency tuning -->
<property>
<name>mapreduce.job.reduces</name>
<value>21</value>
</property>
<property>
<name>mapreduce.job.jvm.numtasks</name>
<value>20</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>2048</value>
</property>
<property>
<name>mapreduce.map.java.opts</name>
<value>-server -Xmx1536m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>6144</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-server -Xmx2560m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps</value>
</property>
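<!-- Note: the container sizes above (2048 MB for maps, 6144 MB for reduces) are larger than the JVM heaps
(-Xmx1536m / -Xmx2560m) so that non-heap memory fits inside the container -->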
<property>
<name>mapreduce.reduce.shuffle.parallelcopies</name>
<value>50</value>
</property>
<property>
<name>mapreduce.reduce.shuffle.input.buffer.percent</name>
<value>0.5</value>
</property>
<property>
<name>mapreduce.task.io.sort.mb</name>
<value>256</value>
</property>
<property>
<name>mapreduce.task.io.sort.factor</name>
<value>100</value>
</property>
<property>
<name>mapreduce.jobtracker.handler.count</name>
<value>64</value>
</property>
<property>
<name>mapreduce.tasktracker.http.threads</name>
<value>20</value>
</property>
<!-- JBOD -->
<property>
<name>mapreduce.cluster.local.dir</name>
<value>/mnt/persistent/hadoop/mapred/local</value>
</property>
<!-- Job history server persistent state -->
<property>
<name>mapreduce.jobhistory.recovery.enable</name>
<value>true</value>
</property>
<property>
<name>mapreduce.jobhistory.recovery.store.class</name>
<value>org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService</value>
</property>
<property>
<name>mapreduce.jobhistory.recovery.store.fs.uri</name>
<value>file://${hadoop.tmp.dir}/mapred-jobhistory-state</value>
</property>
<!-- Compression -->
<property>
<!-- Off by default, because it breaks Druid indexing (at least it does in druid-0.6.10+). Jobs should turn
it on if they need it. -->
<name>mapreduce.output.fileoutputformat.compress</name>
<value>false</value>
</property>
<property>
<name>mapreduce.map.output.compress</name>
<value>true</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress.type</name>
<value>BLOCK</value>
</property>
<property>
<name>mapreduce.map.output.compress.codec</name>
<value>org.apache.hadoop.io.compress.Lz4Codec</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress.codec</name>
<value>org.apache.hadoop.io.compress.GzipCodec</value>
</property>
<!-- Speculative execution would violate various assumptions we've made in our system design -->
<property>
<name>mapreduce.map.speculative</name>
<value>false</value>
</property>
<property>
<name>mapreduce.reduce.speculative</name>
<value>false</value>
</property>
<!-- Some jobs legitimately take a long time to run (for example, long index persists, or Hadoop reading
many empty files into a single mapper), so increase the task timeout to 30 minutes. -->
<property>
<name>mapreduce.task.timeout</name>
<value>1800000</value>
</property>
</configuration>
```
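If it is not convenient to change mapred-site.xml on the cluster itself, many of the per-job settings above can instead be supplied from the Druid side through the `jobProperties` map of a Hadoop index task's `tuningConfig`; properties set that way apply only to that indexing job. The fragment below is only a sketch of such a `tuningConfig` (the rest of the task spec is omitted), reusing the memory settings from the file above:

```
{
  "type": "hadoop",
  "jobProperties": {
    "mapreduce.map.memory.mb": "2048",
    "mapreduce.map.java.opts": "-server -Xmx1536m -Duser.timezone=UTC -Dfile.encoding=UTF-8",
    "mapreduce.reduce.memory.mb": "6144",
    "mapreduce.reduce.java.opts": "-server -Xmx2560m -Duser.timezone=UTC -Dfile.encoding=UTF-8"
  }
}
```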
### Yarn-site.xml
```
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>#{RM_HOSTNAME}</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log.server.url</name>
<value>http://#{IP_LOG_SERVER}:19888/jobhistory/logs/</value>
</property>
<property>
<name>yarn.nodemanager.hostname</name>
<value>#{IP_ADDR}</value>
</property>
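<!-- NodeManager resource sizing: resource.memory-mb and resource.cpu-vcores cap what each NodeManager offers to YARN,
so size them to the actual hardware and keep memory-mb at least as large as the largest container you expect to request -->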
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>512</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>1024</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>1</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<!-- JBOD -->
<property>
<name>yarn.nodemanager.local-dirs</name>
<value>/mnt/persistent/hadoop/nm-local-dir</value>
</property>
<!-- ResourceManager persistent state doesn't work well in tests yet, so disable it -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
</property>
<property>
<name>yarn.resourcemanager.fs.state-store.uri</name>
<value>file://${hadoop.tmp.dir}/yarn-resourcemanager-state</value>
</property>
<!-- Ability to exclude hosts -->
<property>
<name>yarn.resourcemanager.nodes.exclude-path</name>
<value>/mnt/persistent/hadoop/yarn-exclude.txt</value>
</property>
</configuration>
```
### HDFS-site.xml
```
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
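<!-- Let DataNodes register even if their IP addresses do not reverse-resolve to their hostnames (common on EC2) -->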
<property>
<name>dfs.namenode.datanode.registration.ip-hostname-check</name>
<value>false</value>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>/mnt/persistent/hadoop/hdfs-exclude.txt</value>
</property>
<!-- JBOD -->
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///mnt/persistent/hadoop/dfs/data</value>
</property>
</configuration>
```
### Capacity-scheduler.xml
```
<configuration>
<property>
<name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
<value>0.1</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.queues</name>
<value>default</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.capacity</name>
<value>100</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
<value>100</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.state</name>
<value>RUNNING</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
<value>*</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
<value>*</value>
</property>
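<!-- A node-locality-delay of -1 disables the scheduling delay that would otherwise wait for node-local containers -->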
<property>
<name>yarn.scheduler.capacity.node-locality-delay</name>
<value>-1</value>
</property>
</configuration>
```