Merge r1440222 through r1441205 from trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-2802@1441206 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tsz-wo Sze 2013-01-31 21:39:42 +00:00
commit 40df526bd3
58 changed files with 3574 additions and 3630 deletions

View File

@ -325,6 +325,9 @@ Trunk (Unreleased)
HADOOP-9249. hadoop-maven-plugins version-info goal causes build failure
when running with Clover. (Chris Nauroth via suresh)
HADOOP-9264. Port change to use Java untar API on Windows from
branch-1-win to trunk. (Chris Nauroth via suresh)
OPTIMIZATIONS
HADOOP-7761. Improve the performance of raw comparisons. (todd)
@ -586,6 +589,10 @@ Release 2.0.3-alpha - Unreleased
HADOOP-8857. hadoop.http.authentication.signature.secret.file docs
should not state that secret is randomly generated. (tucu)
HADOOP-9221. Convert remaining xdocs to APT. (Andy Isaacson via atm)
HADOOP-8981. TestMetricsSystemImpl fails on Windows. (Xuan Gong via suresh)
Release 2.0.2-alpha - 2012-09-07
INCOMPATIBLE CHANGES

View File

@ -241,6 +241,11 @@
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.4</version>
</dependency>
</dependencies>
<build>
@ -381,6 +386,23 @@
</target>
</configuration>
</execution>
<execution>
<id>copy-test-tarballs</id>
<phase>process-test-resources</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<copy toDir="${test.cache.data}">
<fileset dir="${basedir}/src/test/java/org/apache/hadoop/fs">
<include name="test-untar.tar"/>
<include name="test-untar.tgz"/>
</fileset>
</copy>
</target>
</configuration>
</execution>
<execution>
<phase>pre-site</phase>
<goals>
@ -485,6 +507,7 @@
<exclude>src/test/all-tests</exclude>
<exclude>src/test/resources/kdc/ldif/users.ldif</exclude>
<exclude>src/main/native/src/org/apache/hadoop/io/compress/lz4/lz4.c</exclude>
<exclude>src/test/java/org/apache/hadoop/fs/test-untar.tgz</exclude>
</excludes>
</configuration>
</plugin>

View File

@ -1,109 +0,0 @@
<?xml version="1.0"?>
<!--
Copyright 2002-2004 The Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
"http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>
Superusers Acting On Behalf Of Other Users
</title>
</header>
<body>
<section>
<title> Introduction </title>
<p>
This document describes how a superuser can submit jobs or access hdfs on behalf of another user in a secured way.
</p>
</section>
<section>
<title> Use Case </title>
<p>
The code example described in the next section is applicable for the following use case.
</p>
<p>
A superuser with username 'super' wants to submit a job and access hdfs on behalf of a user joe. The superuser has kerberos credentials but user joe doesn't have any. The tasks are required to run as user joe and any file accesses on namenode are required to be done as user joe. It is required that user joe can connect to the namenode or job tracker on a connection authenticated with super's kerberos credentials. In other words super is impersonating the user joe.
</p>
</section>
<section>
<title> Code example </title>
<p>
In this example super's kerberos credentials are used for login and a proxy user ugi object is created for joe. The operations are performed within the doAs method of this proxy user ugi object.
</p>
<source>
...
//Create ugi for joe. The login user is 'super'.
UserGroupInformation ugi =
UserGroupInformation.createProxyUser("joe", UserGroupInformation.getLoginUser());
ugi.doAs(new PrivilegedExceptionAction&lt;Void&gt;() {
public Void run() throws Exception {
//Submit a job
JobClient jc = new JobClient(conf);
jc.submitJob(conf);
//OR access hdfs
FileSystem fs = FileSystem.get(conf);
fs.mkdir(someFilePath);
}
}
</source>
</section>
<section>
<title> Configurations </title>
<p>
The superuser must be configured on namenode and jobtracker to be allowed to impersonate another user. Following configurations are required.
</p>
<source>
&lt;property&gt;
&lt;name&gt;hadoop.proxyuser.super.groups&lt;/name&gt;
&lt;value&gt;group1,group2&lt;/value&gt;
&lt;description&gt;Allow the superuser super to impersonate any members of the group group1 and group2&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;hadoop.proxyuser.super.hosts&lt;/name&gt;
&lt;value&gt;host1,host2&lt;/value&gt;
&lt;description&gt;The superuser can connect only from host1 and host2 to impersonate a user&lt;/description&gt;
&lt;/property&gt;
</source>
<p>
If these configurations are not present, impersonation will not be allowed and connection will fail.
</p>
<p>
If more lax security is preferred, the wildcard value <code>*</code> may be used to allow impersonation from any host or of any user.
</p>
</section>
<section>
<title> Caveats </title>
<p>
The superuser must have kerberos credentials to be able to impersonate another user. It cannot use delegation tokens for this feature. It would be wrong if superuser adds its own delegation token to the proxy user ugi, as it will allow the proxy user to connect to the service with the privileges of the superuser.
</p>
<p>
However, if the superuser does want to give a delegation token to joe, it must first impersonate joe and get a delegation token for joe, in the same way as the code example above, and add it to the ugi of joe. In this way the delegation token will have the owner as joe.
</p>
</section>
</body>
</document>

View File

@ -1,147 +0,0 @@
<?xml version="1.0"?>
<!--
Copyright 2002-2004 The Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
"http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>
Hadoop Deployment Layout
</title>
</header>
<body>
<section>
<title> Introduction </title>
<p>
This document describes the standard deployment layout for Hadoop. With increased complexity and evolving Hadoop ecosystem, having standard deployment layout ensures better integration between Hadoop sub-projects. By making the installation process easier, we can lower the barrier to entry and increase Hadoop adoption.
</p>
</section>
<section>
<title> Packages </title>
<p>
We need to divide Hadoop up into packages that can be independently upgraded. The list of packages should include:
</p>
<ul>
<li>Hadoop Common - Common including the native code and required jar files.</li>
<li>HDFS Client - HDFS jars, scripts, and shared libraries.</li>
<li>HDFS Server - jsvc executable</li>
<li>Yarn Client - Yarn client jars and scripts</li>
<li>Yarn Server - Yarn server jars and scripts</li>
<li>MapReduce - MapReduce jars, scripts, and shared libraries</li>
<li>LZO - LZO codec from github.com/omally/hadoop-gpl-compression</li>
<li>Metrics - Plugins for Chukwa and Ganglia</li>
</ul>
<p>Packages from other teams will include:</p>
<ul>
<li>Pig</li>
<li>Hive</li>
<li>Oozie client</li>
<li>Oozie server</li>
<li>Howl client</li>
<li>Howl server</li>
</ul>
<p>These packages should be deployable with RPM on RedHat. We also need a package that depends on a version of each of these packages. In general, we can generate tarballs in the new deployment layout.</p>
<p>Note that some packages, like Pig, which are user facing, will have 2 versions installed in a given deployment. This will be accomplished by modifying the package name and the associated binaries to include the version number.</p>
<p>All of the following paths are based on a prefix directory that is the root of the installation. Our packages must support having multiple Hadoop stack installations on a computer at the same time. For RPMs, this means that the packages must be relocatable and honor the --prefix option.</p>
</section>
<section>
<title> Deployment </title>
<p>It is important to have a standard deployment that results from installing the packages regardless of the package manager. Here are the top level directories and a sample of what would be under each. Note that all of the packages are installed "flattened" into the prefix directory. For compatibility reasons, we should create "share/hadoop" that matches the old HADOOP_PREFIX and set the HADOOP_PREFIX variable to that.</p>
<source>
$PREFIX/ bin / hadoop
| | mapred
| | pig -> pig7
| | pig6
| + pig7
|
+ etc / hadoop / core-site.xml
| | hdfs-site.xml
| + mapred-site.xml
|
+ include / hadoop / Pipes.hh
| | + TemplateFactory.hh
| + hdfs.h
|
+ lib / jni / hadoop-common / libhadoop.so.0.20.0
| |
| | libhdfs.so -> libhdfs.so.0.20.0
| + libhdfs.so.0.20.0
|
+ libexec / task-controller
|
+ man / man1 / hadoop.1
| | mapred.1
| | pig6.1
| + pig7.1
|
+ share / hadoop-common
| | hadoop-hdfs
| | hadoop-mapreduce
| | pig6
| + pig7
|
+ sbin / hdfs-admin
| | mapred-admin
|
+ src / hadoop-common
| | hadoop-hdfs
| + hadoop-mapreduce
|
+ var / lib / data-node
| + task-tracker
|
| log / hadoop-datanode
| + hadoop-tasktracker
|
+ run / hadoop-datanode.pid
+ hadoop-tasktracker.pid
</source>
<p>Note that we must continue to honor HADOOP_CONF_DIR to override the configuration location, but that it should default to $prefix/etc. User facing binaries and scripts go into bin. Configuration files go into etc with multiple configuration files having a directory. JNI shared libraries go into lib/jni/$tool since Java does not allow specifying the version of the library to load. Libraries that aren't loaded via System.loadLibrary are placed directly under lib. 64 bit versions of the libraries for platforms that support them should be placed in lib64. All of the architecture-independent pieces, including the jars for each tool will be placed in share/$tool. The default location for all the run time information will be in var. The storage will be in var/lib, the logs in var/log and the pid files in var/run.</p>
</section>
<section>
<title> Path Configurations </title>
<p>Path can be configured at compile phase or installation phase. For RPM, it takes advantage of the --relocate directive to allow path reconfiguration at install phase. For Debian package, path is configured at compile phase.
</p>
<p>Build phase parameter:</p>
<ul>
<li>package.prefix - Location of package prefix (Default /usr)</li>
<li>package.conf.dir - Location of configuration directory (Default /etc/hadoop)</li>
<li>package.log.dir - Location of log directory (Default /var/log/hadoop)</li>
<li>package.pid.dir - Location of pid directory (Default /var/run/hadoop)</li>
</ul>
<p>Install phase parameter:</p>
<source>
rpm -i hadoop-[version]-[rev].[arch].rpm \
--relocate /usr=/usr/local/hadoop \
--relocate /etc/hadoop=/usr/local/etc/hadoop \
--relocate /var/log/hadoop=/opt/logs/hadoop \
--relocate /var/run/hadoop=/opt/run/hadoop
</source>
</section>
</body>
</document>

View File

@ -1,232 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>Native Libraries Guide</title>
</header>
<body>
<section>
<title>Overview</title>
<p>This guide describes the native hadoop library and includes a small discussion about native shared libraries.</p>
<p><strong>Note:</strong> Depending on your environment, the term "native libraries" <em>could</em>
refer to all *.so's you need to compile; and, the term "native compression" <em>could</em> refer to all *.so's
you need to compile that are specifically related to compression.
Currently, however, this document only addresses the native hadoop library (<em>libhadoop.so</em>).</p>
</section>
<section>
<title>Native Hadoop Library </title>
<p>Hadoop has native implementations of certain components for
performance reasons and for non-availability of Java implementations. These
components are available in a single, dynamically-linked native library called
the native hadoop library. On the *nix platforms the library is named <em>libhadoop.so</em>. </p>
<section>
<title>Usage</title>
<p>It is fairly easy to use the native hadoop library:</p>
<ol>
<li>
Review the <a href="#Components">components</a>.
</li>
<li>
Review the <a href="#Supported+Platforms">supported platforms</a>.
</li>
<li>
Either <a href="#Download">download</a> a hadoop release, which will
include a pre-built version of the native hadoop library, or
<a href="#Build">build</a> your own version of the
native hadoop library. Whether you download or build, the name for the library is
the same: <em>libhadoop.so</em>
</li>
<li>
Install the compression codec development packages
(<strong>&gt;zlib-1.2</strong>, <strong>&gt;gzip-1.2</strong>):
<ul>
<li>If you download the library, install one or more development packages -
whichever compression codecs you want to use with your deployment.</li>
<li>If you build the library, it is <strong>mandatory</strong>
to install both development packages.</li>
</ul>
</li>
<li>
Check the <a href="#Runtime">runtime</a> log files.
</li>
</ol>
</section>
<section>
<title>Components</title>
<p>The native hadoop library includes two components, the zlib and gzip
<a href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/compress/CompressionCodec.html">
compression codecs</a>:
</p>
<ul>
<li><a href="ext:zlib">zlib</a></li>
<li><a href="ext:gzip">gzip</a></li>
</ul>
<p>The native hadoop library is imperative for gzip to work.</p>
</section>
<section>
<title>Supported Platforms</title>
<p>The native hadoop library is supported on *nix platforms only.
The library does not work with <a href="ext:cygwin">Cygwin</a>
or the <a href="ext:osx">Mac OS X</a> platform.</p>
<p>The native hadoop library is mainly used on the GNU/Linux platform and
has been tested on these distributions:</p>
<ul>
<li>
<a href="http://www.redhat.com/rhel/">RHEL4</a>/<a href="http://fedora.redhat.com/">Fedora</a>
</li>
<li><a href="http://www.ubuntu.com/">Ubuntu</a></li>
<li><a href="http://www.gentoo.org/">Gentoo</a></li>
</ul>
<p>On all the above distributions a 32/64 bit native hadoop library will work
with a respective 32/64 bit jvm.</p>
</section>
<section>
<title>Download</title>
<p>The pre-built 32-bit i386-Linux native hadoop library is available as part of the
hadoop distribution and is located in the <code>lib/native</code> directory. You can download the
hadoop distribution from <a href="ext:releases/download">Hadoop Common Releases</a>.</p>
<p>Be sure to install the zlib and/or gzip development packages - whichever compression
codecs you want to use with your deployment.</p>
</section>
<section>
<title>Build</title>
<p>The native hadoop library is written in <a href="http://en.wikipedia.org/wiki/ANSI_C">ANSI C</a>
and is built using the GNU autotools-chain (autoconf, autoheader, automake, autoscan, libtool).
This means it should be straight-forward to build the library on any platform with a standards-compliant
C compiler and the GNU autotools-chain (see the <a href="#Supported+Platforms">supported platforms</a>).</p>
<p>The packages you need to install on the target platform are:</p>
<ul>
<li>
C compiler (e.g. <a href="http://gcc.gnu.org/">GNU C Compiler</a>)
</li>
<li>
GNU Autotools Chain:
<a href="http://www.gnu.org/software/autoconf/">autoconf</a>,
<a href="http://www.gnu.org/software/automake/">automake</a>,
<a href="http://www.gnu.org/software/libtool/">libtool</a>
</li>
<li>
zlib-development package (stable version >= 1.2.0)
</li>
</ul>
<p>Once you have installed the prerequisite packages, use the standard hadoop <code>build.xml</code>
file and pass along the <code>compile.native</code> flag (set to <code>true</code>) to build the native hadoop library:</p>
<p><code>$ ant -Dcompile.native=true &lt;target&gt;</code></p>
<p>You should see the newly-built library in:</p>
<p><code>$ build/native/&lt;platform&gt;/lib</code></p>
<p>where &lt;<code>platform</code>&gt; is a combination of the system-properties:
<code>${os.name}-${os.arch}-${sun.arch.data.model}</code> (for example, Linux-i386-32).</p>
<p>Please note the following:</p>
<ul>
<li>
It is <strong>mandatory</strong> to install both the zlib and gzip
development packages on the target platform in order to build the
native hadoop library; however, for deployment it is sufficient to
install just one package if you wish to use only one codec.
</li>
<li>
It is necessary to have the correct 32/64 libraries for zlib,
depending on the 32/64 bit jvm for the target platform, in order to
build and deploy the native hadoop library.
</li>
</ul>
</section>
<section>
<title>Runtime</title>
<p>The <code>bin/hadoop</code> script ensures that the native hadoop
library is on the library path via the system property: <br/>
<em>-Djava.library.path=&lt;path&gt;</em></p>
<p>During runtime, check the hadoop log files for your MapReduce tasks.</p>
<ul>
<li>If everything is all right, then:<br/><br/>
<code> DEBUG util.NativeCodeLoader - Trying to load the custom-built native-hadoop library... </code><br/>
<code> INFO util.NativeCodeLoader - Loaded the native-hadoop library </code><br/>
</li>
<li>If something goes wrong, then:<br/><br/>
<code>
INFO util.NativeCodeLoader - Unable to load native-hadoop library for
your platform... using builtin-java classes where applicable
</code>
</li>
</ul>
</section>
</section>
<section>
<title>Native Shared Libraries</title>
<p>You can load <strong>any</strong> native shared library using
<a href="http://hadoop.apache.org/mapreduce/docs/current/mapred_tutorial.html#DistributedCache">DistributedCache</a>
for <em>distributing</em> and <em>symlinking</em> the library files.</p>
<p>This example shows you how to distribute a shared library, <code>mylib.so</code>,
and load it from a MapReduce task.</p>
<ol>
<li> First copy the library to the HDFS: <br/>
<code>bin/hadoop fs -copyFromLocal mylib.so.1 /libraries/mylib.so.1</code>
</li>
<li> The job launching program should contain the following: <br/>
<code> DistributedCache.createSymlink(conf); </code> <br/>
<code> DistributedCache.addCacheFile("hdfs://host:port/libraries/mylib.so.1#mylib.so", conf);
</code>
</li>
<li> The MapReduce task can contain: <br/>
<code> System.loadLibrary("mylib.so"); </code>
</li>
</ol>
<p><br/><strong>Note:</strong> If you downloaded or built the native hadoop library, you don't need to use DistributedCache to
make the library available to your MapReduce tasks.</p>
</section>
</body>
</document>

View File

@ -1,222 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>Service Level Authorization Guide</title>
</header>
<body>
<section>
<title>Purpose</title>
<p>This document describes how to configure and manage <em>Service Level
Authorization</em> for Hadoop.</p>
</section>
<section>
<title>Prerequisites</title>
<p>Make sure Hadoop is installed, configured and setup correctly. For more information see: </p>
<ul>
<li>
<a href="single_node_setup.html">Single Node Setup</a> for first-time users.
</li>
<li>
<a href="cluster_setup.html">Cluster Setup</a> for large, distributed clusters.
</li>
</ul>
</section>
<section>
<title>Overview</title>
<p>Service Level Authorization is the initial authorization mechanism to
ensure clients connecting to a particular Hadoop <em>service</em> have the
necessary, pre-configured, permissions and are authorized to access the given
service. For example, a MapReduce cluster can use this mechanism to allow a
configured list of users/groups to submit jobs.</p>
<p>The <code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> configuration file
is used to define the access control lists for various Hadoop services.</p>
<p>Service Level Authorization is performed well before other access
control checks such as file-permission checks, access control on job queues
etc.</p>
</section>
<section>
<title>Configuration</title>
<p>This section describes how to configure service-level authorization
via the configuration file <code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code>.
</p>
<section>
<title>Enable Service Level Authorization</title>
<p>By default, service-level authorization is disabled for Hadoop. To
enable it set the configuration property
<code>hadoop.security.authorization</code> to <strong>true</strong>
in <code>${HADOOP_CONF_DIR}/core-site.xml</code>.</p>
</section>
<section>
<title>Hadoop Services and Configuration Properties</title>
<p>This section lists the various Hadoop services and their configuration
knobs:</p>
<table>
<tr>
<th>Property</th>
<th>Service</th>
</tr>
<tr>
<td><code>security.client.protocol.acl</code></td>
<td>ACL for ClientProtocol, which is used by user code via the
DistributedFileSystem.</td>
</tr>
<tr>
<td><code>security.client.datanode.protocol.acl</code></td>
<td>ACL for ClientDatanodeProtocol, the client-to-datanode protocol
for block recovery.</td>
</tr>
<tr>
<td><code>security.datanode.protocol.acl</code></td>
<td>ACL for DatanodeProtocol, which is used by datanodes to
communicate with the namenode.</td>
</tr>
<tr>
<td><code>security.inter.datanode.protocol.acl</code></td>
<td>ACL for InterDatanodeProtocol, the inter-datanode protocol
for updating generation timestamp.</td>
</tr>
<tr>
<td><code>security.namenode.protocol.acl</code></td>
<td>ACL for NamenodeProtocol, the protocol used by the secondary
namenode to communicate with the namenode.</td>
</tr>
<tr>
<td><code>security.refresh.policy.protocol.acl</code></td>
<td>ACL for RefreshAuthorizationPolicyProtocol, used by the
dfsadmin and mradmin commands to refresh the security policy in-effect.
</td>
</tr>
<tr>
<td><code>security.ha.service.protocol.acl</code></td>
<td>ACL for HAService protocol used by HAAdmin to manage the
active and stand-by states of namenode.
</td>
</tr>
</table>
</section>
<section>
<title>Access Control Lists</title>
<p><code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> defines an access
control list for each Hadoop service. Every access control list has a
simple format:</p>
<p>The lists of users and groups are both comma-separated lists of names.
The two lists are separated by a space.</p>
<p>Example: <code>user1,user2 group1,group2</code>.</p>
<p>Add a blank at the beginning of the line if only a list of groups
is to be provided; equivalently, a comma-separated list of users followed
by a space or nothing implies only a set of given users.</p>
<p>A special value of <strong>*</strong> implies that all users are
allowed to access the service.</p>
</section>
<section>
<title>Refreshing Service Level Authorization Configuration</title>
<p>The service-level authorization configuration for the NameNode and
JobTracker can be changed without restarting either of the Hadoop master
daemons. The cluster administrator can change
<code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> on the master nodes and
instruct the NameNode and JobTracker to reload their respective
configurations via the <em>-refreshServiceAcl</em> switch to
<em>dfsadmin</em> and <em>mradmin</em> commands respectively.</p>
<p>Refresh the service-level authorization configuration for the
NameNode:</p>
<p>
<code>$ bin/hadoop dfsadmin -refreshServiceAcl</code>
</p>
<p>Refresh the service-level authorization configuration for the
JobTracker:</p>
<p>
<code>$ bin/hadoop mradmin -refreshServiceAcl</code>
</p>
<p>Of course, one can use the
<code>security.refresh.policy.protocol.acl</code> property in
<code>${HADOOP_CONF_DIR}/hadoop-policy.xml</code> to restrict access to
the ability to refresh the service-level authorization configuration to
certain users/groups.</p>
</section>
<section>
<title>Examples</title>
<p>Allow only users <code>alice</code>, <code>bob</code> and users in the
<code>mapreduce</code> group to submit jobs to the MapReduce cluster:</p>
<source>
&lt;property&gt;
&lt;name&gt;security.job.submission.protocol.acl&lt;/name&gt;
&lt;value&gt;alice,bob mapreduce&lt;/value&gt;
&lt;/property&gt;
</source>
<p></p><p>Allow only DataNodes running as the users who belong to the
group <code>datanodes</code> to communicate with the NameNode:</p>
<source>
&lt;property&gt;
&lt;name&gt;security.datanode.protocol.acl&lt;/name&gt;
&lt;value&gt;datanodes&lt;/value&gt;
&lt;/property&gt;
</source>
<p></p><p>Allow any user to talk to the HDFS cluster as a DFSClient:</p>
<source>
&lt;property&gt;
&lt;name&gt;security.client.protocol.acl&lt;/name&gt;
&lt;value&gt;*&lt;/value&gt;
&lt;/property&gt;
</source>
</section>
</section>
</body>
</document>

View File

@ -1,293 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>Single Node Setup</title>
</header>
<body>
<section>
<title>Purpose</title>
<p>This document describes how to set up and configure a single-node Hadoop
installation so that you can quickly perform simple operations using Hadoop
MapReduce and the Hadoop Distributed File System (HDFS).</p>
</section>
<section id="PreReqs">
<title>Prerequisites</title>
<section>
<title>Supported Platforms</title>
<ul>
<li>
GNU/Linux is supported as a development and production platform.
Hadoop has been demonstrated on GNU/Linux clusters with 2000 nodes.
</li>
<li>
Win32 is supported as a <em>development platform</em>. Distributed
operation has not been well tested on Win32, so it is not
supported as a <em>production platform</em>.
</li>
</ul>
</section>
<section>
<title>Required Software</title>
<p>Required software for Linux and Windows includes:</p>
<ol>
<li>
Java<sup>TM</sup> 1.6.x, preferably from Sun, must be installed.
</li>
<li>
<strong>ssh</strong> must be installed and <strong>sshd</strong> must
be running to use the Hadoop scripts that manage remote Hadoop
daemons.
</li>
</ol>
<p>Additional requirements for Windows include:</p>
<ol>
<li>
<a href="http://www.cygwin.com/">Cygwin</a> - Required for shell
support in addition to the required software above.
</li>
</ol>
</section>
<section>
<title>Installing Software</title>
<p>If your cluster doesn't have the requisite software you will need to
install it.</p>
<p>For example on Ubuntu Linux:</p>
<p>
<code>$ sudo apt-get install ssh</code><br/>
<code>$ sudo apt-get install rsync</code>
</p>
<p>On Windows, if you did not install the required software when you
installed cygwin, start the cygwin installer and select the packages:</p>
<ul>
<li>openssh - the <em>Net</em> category</li>
</ul>
</section>
</section>
<section id="Download">
<title>Download</title>
<p>
To get a Hadoop distribution, download a recent
<a href="ext:releases">stable release</a> from one of the Apache Download
Mirrors.
</p>
</section>
<section>
<title>Prepare to Start the Hadoop Cluster</title>
<p>
Unpack the downloaded Hadoop distribution. In the distribution, edit the
file <code>conf/hadoop-env.sh</code> to define at least
<code>JAVA_HOME</code> to be the root of your Java installation.
</p>
<p>
Try the following command:<br/>
<code>$ bin/hadoop</code><br/>
This will display the usage documentation for the <strong>hadoop</strong>
script.
</p>
<p>Now you are ready to start your Hadoop cluster in one of the three supported
modes:
</p>
<ul>
<li>Local (Standalone) Mode</li>
<li>Pseudo-Distributed Mode</li>
<li>Fully-Distributed Mode</li>
</ul>
</section>
<section id="Local">
<title>Standalone Operation</title>
<p>By default, Hadoop is configured to run in a non-distributed
mode, as a single Java process. This is useful for debugging.</p>
<p>
The following example copies the unpacked <code>conf</code> directory to
use as input and then finds and displays every match of the given regular
expression. Output is written to the given <code>output</code> directory.
<br/>
<code>$ mkdir input</code><br/>
<code>$ cp conf/*.xml input</code><br/>
<code>
$ bin/hadoop jar hadoop-*-examples.jar grep input output 'dfs[a-z.]+'
</code><br/>
<code>$ cat output/*</code>
</p>
</section>
<section id="PseudoDistributed">
<title>Pseudo-Distributed Operation</title>
<p>Hadoop can also be run on a single-node in a pseudo-distributed mode
where each Hadoop daemon runs in a separate Java process.</p>
<section>
<title>Configuration</title>
<p>Use the following:
<br/><br/>
<code>conf/core-site.xml</code>:</p>
<source>
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;fs.defaultFS&lt;/name&gt;
&lt;value&gt;hdfs://localhost:9000&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
</source>
<p><br/><code>conf/hdfs-site.xml</code>:</p>
<source>
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;dfs.replication&lt;/name&gt;
&lt;value&gt;1&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
</source>
<p><br/><code>conf/mapred-site.xml</code>:</p>
<source>
&lt;configuration&gt;
&lt;property&gt;
&lt;name&gt;mapred.job.tracker&lt;/name&gt;
&lt;value&gt;localhost:9001&lt;/value&gt;
&lt;/property&gt;
&lt;/configuration&gt;
</source>
</section>
<section>
<title>Setup passphraseless <em>ssh</em></title>
<p>
Now check that you can ssh to the localhost without a passphrase:<br/>
<code>$ ssh localhost</code>
</p>
<p>
If you cannot ssh to localhost without a passphrase, execute the
following commands:<br/>
<code>$ ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa</code><br/>
<code>$ cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys</code>
</p>
</section>
<section>
<title>Execution</title>
<p>
Format a new distributed-filesystem:<br/>
<code>$ bin/hadoop namenode -format</code>
</p>
<p>
Start the hadoop daemons:<br/>
<code>$ bin/start-all.sh</code>
</p>
<p>The hadoop daemon log output is written to the
<code>${HADOOP_LOG_DIR}</code> directory (defaults to
<code>${HADOOP_PREFIX}/logs</code>).</p>
<p>Browse the web interface for the NameNode and the JobTracker; by
default they are available at:</p>
<ul>
<li>
<code>NameNode</code> -
<a href="http://localhost:50070/">http://localhost:50070/</a>
</li>
<li>
<code>JobTracker</code> -
<a href="http://localhost:50030/">http://localhost:50030/</a>
</li>
</ul>
<p>
Copy the input files into the distributed filesystem:<br/>
<code>$ bin/hadoop fs -put conf input</code>
</p>
<p>
Run some of the examples provided:<br/>
<code>
$ bin/hadoop jar hadoop-*-examples.jar grep input output 'dfs[a-z.]+'
</code>
</p>
<p>Examine the output files:</p>
<p>
Copy the output files from the distributed filesystem to the local
filesystem and examine them:<br/>
<code>$ bin/hadoop fs -get output output</code><br/>
<code>$ cat output/*</code>
</p>
<p> or </p>
<p>
View the output files on the distributed filesystem:<br/>
<code>$ bin/hadoop fs -cat output/*</code>
</p>
<p>
When you're done, stop the daemons with:<br/>
<code>$ bin/stop-all.sh</code>
</p>
</section>
</section>
<section id="FullyDistributed">
<title>Fully-Distributed Operation</title>
<p>For information on setting up fully-distributed, non-trivial clusters
see <a href="cluster_setup.html">Cluster Setup</a>.</p>
</section>
<p>
<em>Java and JNI are trademarks or registered trademarks of
Sun Microsystems, Inc. in the United States and other countries.</em>
</p>
</body>
</document>

View File

@ -1128,6 +1128,17 @@ public abstract class FileSystem extends Configured implements Closeable {
public abstract FSDataOutputStream append(Path f, int bufferSize,
Progressable progress) throws IOException;
/**
 * Concatenates existing source files onto a target file.
 * <p>
 * This base implementation does not perform any concatenation: it always
 * throws {@link UnsupportedOperationException}, naming the concrete class
 * in the message.
 *
 * @param trg the path to the target destination.
 * @param psrcs the paths to the sources to use for the concatenation.
 * @throws IOException declared for implementations that do real I/O
 * @throws UnsupportedOperationException always, in this implementation
 */
public void concat(final Path trg, final Path [] psrcs) throws IOException {
  final String implName = getClass().getSimpleName();
  throw new UnsupportedOperationException(
      "Not implemented by the " + implName + " FileSystem implementation");
}
/**
* Get replication.
*

View File

@ -21,9 +21,12 @@ package org.apache.hadoop.fs;
import java.io.*;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@ -624,14 +627,28 @@ public class FileUtil {
* @throws IOException
*/
public static void unTar(File inFile, File untarDir) throws IOException {
// Make sure the destination directory exists; a failed mkdirs() is only an
// error when the path is not already a directory.
// NOTE(review): the mkdirs() guard appears twice and untarCommand is not
// used in this method as shown; this looks like both sides of a diff hunk.
// Confirm against the actual file.
if (!untarDir.mkdirs()) {
if (!untarDir.mkdirs()) {
if (!untarDir.isDirectory()) {
throw new IOException("Mkdirs failed to create " + untarDir);
}
}
StringBuilder untarCommand = new StringBuilder();
// Any file name ending in "gz" (e.g. .tgz, .tar.gz) is treated as gzipped.
boolean gzipped = inFile.toString().endsWith("gz");
if(Shell.WINDOWS) {
// Tar is not native to Windows. Use simple Java based implementation for
// tests and simple tar archives
unTarUsingJava(inFile, untarDir, gzipped);
}
else {
// spawn tar utility to untar archive for full fledged unix behavior such
// as resolving symlinks in tar archives
unTarUsingTar(inFile, untarDir, gzipped);
}
}
private static void unTarUsingTar(File inFile, File untarDir,
boolean gzipped) throws IOException {
StringBuffer untarCommand = new StringBuffer();
if (gzipped) {
untarCommand.append(" gzip -dc '");
untarCommand.append(FileUtil.makeShellPath(inFile));
@ -656,7 +673,62 @@ public class FileUtil {
". Tar process exited with exit code " + exitcode);
}
}
/**
 * Extracts a tar archive into a directory using the Java tar reader
 * instead of spawning an external tar process.
 *
 * @param inFile the tar (or gzipped tar) file to read
 * @param untarDir the directory the entries are unpacked into
 * @param gzipped whether {@code inFile} must be gunzipped while reading
 * @throws IOException if the archive cannot be read or an entry cannot be
 *         written
 */
private static void unTarUsingJava(File inFile, File untarDir,
    boolean gzipped) throws IOException {
  InputStream inputStream = new FileInputStream(inFile);
  TarArchiveInputStream tis = null;
  try {
    // Build the stream chain inside the try so that a failure while
    // wrapping (e.g. a bad gzip header) still closes the file stream.
    if (gzipped) {
      inputStream = new GZIPInputStream(inputStream);
    }
    inputStream = new BufferedInputStream(inputStream);
    tis = new TarArchiveInputStream(inputStream);

    for (TarArchiveEntry entry = tis.getNextTarEntry(); entry != null;
        entry = tis.getNextTarEntry()) {
      unpackEntries(tis, entry, untarDir);
    }
  } finally {
    // Closing the outermost stream closes everything it wraps.
    if (tis != null) {
      tis.close();
    } else {
      inputStream.close();
    }
  }
}
/**
 * Unpacks a single tar entry below {@code outputDir}.
 * <p>
 * A directory entry is created (with any missing parents) and the entries
 * it reports via {@code getDirectoryEntries()} are unpacked recursively.
 * A file entry has its content copied from {@code tis} to a file named
 * after the entry.
 *
 * @param tis the tar stream, positioned at the data of {@code entry}
 * @param entry the entry to unpack
 * @param outputDir the directory the entry name is resolved against
 * @throws IOException if a directory cannot be created or the copy fails
 */
private static void unpackEntries(TarArchiveInputStream tis,
    TarArchiveEntry entry, File outputDir) throws IOException {
  // NOTE(review): entry names are resolved against outputDir without any
  // check that the result stays inside it; confirm archives are trusted.
  if (entry.isDirectory()) {
    File subDir = new File(outputDir, entry.getName());
    // mkdirs() so that archives which omit intermediate directory
    // entries still unpack.
    if (!subDir.mkdirs() && !subDir.isDirectory()) {
      throw new IOException("Mkdirs failed to create tar internal dir "
          + subDir);
    }

    for (TarArchiveEntry e : entry.getDirectoryEntries()) {
      unpackEntries(tis, e, subDir);
    }

    return;
  }

  File outputFile = new File(outputDir, entry.getName());
  // The entry name may contain path components, so the directory that has
  // to exist is the file's own parent, not necessarily outputDir.
  File parentDir = outputFile.getParentFile();
  if (parentDir != null && !parentDir.mkdirs() && !parentDir.isDirectory()) {
    throw new IOException("Mkdirs failed to create tar internal dir "
        + parentDir);
  }

  byte[] data = new byte[2048];
  BufferedOutputStream outputStream = new BufferedOutputStream(
      new FileOutputStream(outputFile));
  try {
    int count;
    while ((count = tis.read(data)) != -1) {
      outputStream.write(data, 0, count);
    }
    outputStream.flush();
  } finally {
    // Always release the file handle, even when the copy fails midway.
    outputStream.close();
  }
}
/**
* Class for creating hardlinks.
* Supports Unix, Cygwin, WindXP.

View File

@ -159,6 +159,11 @@ public class FilterFileSystem extends FileSystem {
return fs.append(f, bufferSize, progress);
}
/**
 * Forwards the call unchanged to {@code fs.concat}.
 */
@Override
public void concat(Path f, Path[] psrcs) throws IOException {
fs.concat(f, psrcs);
}
@Override
public FSDataOutputStream create(Path f, FsPermission permission,
boolean overwrite, int bufferSize, short replication, long blockSize,

View File

@ -55,6 +55,9 @@ public enum Errno {
EPIPE,
EDOM,
ERANGE,
ELOOP,
ENAMETOOLONG,
ENOTEMPTY,
UNKNOWN;
}

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.io.nativeio;
import java.io.File;
import java.io.FileDescriptor;
import java.io.IOException;
import java.util.Map;
@ -293,4 +294,35 @@ public class NativeIO {
stat.group = getName(IdCache.GROUP, stat.groupId);
return stat;
}
/**
 * Renames {@code src} to {@code dst}, reporting failure with an exception
 * rather than a boolean result.
 * <p>
 * When the native library is loaded, the rename is delegated to the native
 * {@code renameTo0} using the absolute paths of both files. Otherwise it
 * falls back to {@link File#renameTo(File)} and throws a plain
 * {@link IOException} if that returns {@code false}.
 *
 * @param src The source path
 * @param dst The destination path
 *
 * @throws IOException On failure.
 */
public static void renameTo(File src, File dst)
    throws IOException {
  if (nativeLoaded) {
    renameTo0(src.getAbsolutePath(), dst.getAbsolutePath());
    return;
  }
  final boolean renamed = src.renameTo(dst);
  if (!renamed) {
    throw new IOException("renameTo(src=" + src + ", dst=" +
        dst + ") failed.");
  }
}
/**
 * Native method backing {@link #renameTo(File, File)}, which calls it with
 * the absolute paths of the source and destination files.
 *
 * @param src The source path, as a string
 * @param dst The destination path, as a string
 *
 * @throws NativeIOException On failure.
 */
private static native void renameTo0(String src, String dst)
throws NativeIOException;
}

View File

@ -24,11 +24,12 @@
#include <grp.h>
#include <jni.h>
#include <pwd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include "config.h"
@ -502,6 +503,26 @@ ssize_t get_pw_buflen() {
#endif
return (ret > 512) ? ret : 512;
}
/*
 * JNI entry point for NativeIO.renameTo0(String src, String dst).
 *
 * Converts both Java strings to UTF chars, calls rename(2), and raises an
 * exception through throw_ioe() carrying errno when rename fails. Any
 * string that was obtained is released before returning.
 */
JNIEXPORT void JNICALL
Java_org_apache_hadoop_io_nativeio_NativeIO_renameTo0(JNIEnv *env,
jclass clazz, jstring jsrc, jstring jdst)
{
  const char *src_path = (*env)->GetStringUTFChars(env, jsrc, NULL);
  const char *dst_path = NULL;

  /* A NULL result means an exception was thrown; skip the rename. */
  if (src_path != NULL) {
    dst_path = (*env)->GetStringUTFChars(env, jdst, NULL);
    if (dst_path != NULL) {
      if (rename(src_path, dst_path) != 0) {
        throw_ioe(env, errno);
      }
    }
  }

  if (src_path != NULL) {
    (*env)->ReleaseStringUTFChars(env, jsrc, src_path);
  }
  if (dst_path != NULL) {
    (*env)->ReleaseStringUTFChars(env, jdst, dst_path);
  }
}
/**
* vim: sw=2: ts=2: et:
*/

View File

@ -63,6 +63,9 @@ static errno_mapping_t ERRNO_MAPPINGS[] = {
MAPPING(EPIPE),
MAPPING(EDOM),
MAPPING(ERANGE),
MAPPING(ELOOP),
MAPPING(ENAMETOOLONG),
MAPPING(ENOTEMPTY),
{-1, NULL}
};

View File

@ -0,0 +1,183 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Native Libraries Guide
---
---
${maven.build.timestamp}
Native Libraries Guide
%{toc|section=1|fromDepth=0}
* Overview
This guide describes the native hadoop library and includes a small
discussion about native shared libraries.
Note: Depending on your environment, the term "native libraries" could
refer to all *.so's you need to compile; and, the term "native
compression" could refer to all *.so's you need to compile that are
specifically related to compression. Currently, however, this document
only addresses the native hadoop library (<<<libhadoop.so>>>).
* Native Hadoop Library
Hadoop has native implementations of certain components for performance
reasons and for non-availability of Java implementations. These
components are available in a single, dynamically-linked native library
called the native hadoop library. On the *nix platforms the library is
named <<<libhadoop.so>>>.
* Usage
It is fairly easy to use the native hadoop library:
[[1]] Review the components.
[[2]] Review the supported platforms.
[[3]] Either download a hadoop release, which will include a pre-built
version of the native hadoop library, or build your own version of
the native hadoop library. Whether you download or build, the name
for the library is the same: libhadoop.so
[[4]] Install the compression codec development packages (>zlib-1.2,
>gzip-1.2):
+ If you download the library, install one or more development
packages - whichever compression codecs you want to use with
your deployment.
+ If you build the library, it is mandatory to install both
development packages.
[[5]] Check the runtime log files.
* Components
The native hadoop library includes two components, the zlib and gzip
compression codecs:
* zlib
* gzip
The native hadoop library is imperative for gzip to work.
* Supported Platforms
The native hadoop library is supported on *nix platforms only. The
library does not work with Cygwin or the Mac OS X platform.
The native hadoop library is mainly used on the GNU/Linux platform and
has been tested on these distributions:
* RHEL4/Fedora
* Ubuntu
* Gentoo
On all the above distributions a 32/64 bit native hadoop library will
work with a respective 32/64 bit jvm.
* Download
The pre-built 32-bit i386-Linux native hadoop library is available as
part of the hadoop distribution and is located in the <<<lib/native>>>
directory. You can download the hadoop distribution from Hadoop Common
Releases.
Be sure to install the zlib and/or gzip development packages -
whichever compression codecs you want to use with your deployment.
* Build
The native hadoop library is written in ANSI C and is built using the
GNU autotools-chain (autoconf, autoheader, automake, autoscan,
libtool). This means it should be straight-forward to build the library
on any platform with a standards-compliant C compiler and the GNU
autotools-chain (see the supported platforms).
The packages you need to install on the target platform are:
* C compiler (e.g. GNU C Compiler)
* GNU Autotools Chain: autoconf, automake, libtool
* zlib-development package (stable version >= 1.2.0)
Once you installed the prerequisite packages use the standard hadoop
build.xml file and pass along the compile.native flag (set to true) to
build the native hadoop library:
----
$ ant -Dcompile.native=true <target>
----
You should see the newly-built library in:
----
$ build/native/<platform>/lib
----
where <platform> is a combination of the system-properties:
${os.name}-${os.arch}-${sun.arch.data.model} (for example,
Linux-i386-32).
Please note the following:
* It is mandatory to install both the zlib and gzip development
packages on the target platform in order to build the native hadoop
library; however, for deployment it is sufficient to install just
one package if you wish to use only one codec.
* It is necessary to have the correct 32/64 libraries for zlib,
depending on the 32/64 bit jvm for the target platform, in order to
build and deploy the native hadoop library.
* Runtime
The bin/hadoop script ensures that the native hadoop library is on the
library path via the system property:
<<<-Djava.library.path=<path> >>>
During runtime, check the hadoop log files for your MapReduce tasks.
* If everything is all right, then:
<<<DEBUG util.NativeCodeLoader - Trying to load the custom-built native-hadoop library...>>>
<<<INFO util.NativeCodeLoader - Loaded the native-hadoop library>>>
* If something goes wrong, then:
<<<INFO util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable>>>
* Native Shared Libraries
You can load any native shared library using DistributedCache for
distributing and symlinking the library files.
This example shows you how to distribute a shared library, mylib.so,
and load it from a MapReduce task.
[[1]] First copy the library to the HDFS:
<<<bin/hadoop fs -copyFromLocal mylib.so.1 /libraries/mylib.so.1>>>
[[2]] The job launching program should contain the following:
<<<DistributedCache.createSymlink(conf);>>>
<<<DistributedCache.addCacheFile("hdfs://host:port/libraries/mylib.so.1#mylib.so", conf);>>>
[[3]] The MapReduce task can contain:
<<<System.loadLibrary("mylib.so");>>>
Note: If you downloaded or built the native hadoop library, you don't
need to use DistributedCache to make the library available to your
MapReduce tasks.

View File

@ -0,0 +1,164 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Service Level Authorization Guide
---
---
${maven.build.timestamp}
Service Level Authorization Guide
%{toc|section=1|fromDepth=0}
* Purpose
This document describes how to configure and manage Service Level
Authorization for Hadoop.
* Prerequisites
Make sure Hadoop is installed, configured and setup correctly. For more
information see:
* Single Node Setup for first-time users.
* Cluster Setup for large, distributed clusters.
* Overview
Service Level Authorization is the initial authorization mechanism to
ensure clients connecting to a particular Hadoop service have the
necessary, pre-configured, permissions and are authorized to access the
given service. For example, a MapReduce cluster can use this mechanism
to allow a configured list of users/groups to submit jobs.
The <<<${HADOOP_CONF_DIR}/hadoop-policy.xml>>> configuration file is used to
define the access control lists for various Hadoop services.
Service Level Authorization is performed well before other access
control checks such as file-permission checks, access control on job
queues etc.
* Configuration
This section describes how to configure service-level authorization via
the configuration file <<<${HADOOP_CONF_DIR}/hadoop-policy.xml>>>.
** Enable Service Level Authorization
By default, service-level authorization is disabled for Hadoop. To
enable it set the configuration property hadoop.security.authorization
to true in <<<${HADOOP_CONF_DIR}/core-site.xml>>>.
** Hadoop Services and Configuration Properties
This section lists the various Hadoop services and their configuration
knobs:
*-------------------------------------+--------------------------------------+
|| Property || Service
*-------------------------------------+--------------------------------------+
security.client.protocol.acl | ACL for ClientProtocol, which is used by user code via the DistributedFileSystem.
*-------------------------------------+--------------------------------------+
security.client.datanode.protocol.acl | ACL for ClientDatanodeProtocol, the client-to-datanode protocol for block recovery.
*-------------------------------------+--------------------------------------+
security.datanode.protocol.acl | ACL for DatanodeProtocol, which is used by datanodes to communicate with the namenode.
*-------------------------------------+--------------------------------------+
security.inter.datanode.protocol.acl | ACL for InterDatanodeProtocol, the inter-datanode protocol for updating generation timestamp.
*-------------------------------------+--------------------------------------+
security.namenode.protocol.acl | ACL for NamenodeProtocol, the protocol used by the secondary namenode to communicate with the namenode.
*-------------------------------------+--------------------------------------+
security.inter.tracker.protocol.acl | ACL for InterTrackerProtocol, used by the tasktrackers to communicate with the jobtracker.
*-------------------------------------+--------------------------------------+
security.job.submission.protocol.acl | ACL for JobSubmissionProtocol, used by job clients to communicate with the jobtracker for job submission, querying job status etc.
*-------------------------------------+--------------------------------------+
security.task.umbilical.protocol.acl | ACL for TaskUmbilicalProtocol, used by the map and reduce tasks to communicate with the parent tasktracker.
*-------------------------------------+--------------------------------------+
security.refresh.policy.protocol.acl | ACL for RefreshAuthorizationPolicyProtocol, used by the dfsadmin and mradmin commands to refresh the security policy in-effect.
*-------------------------------------+--------------------------------------+
security.ha.service.protocol.acl | ACL for HAService protocol used by HAAdmin to manage the active and stand-by states of namenode.
*-------------------------------------+--------------------------------------+
** Access Control Lists
<<<${HADOOP_CONF_DIR}/hadoop-policy.xml>>> defines an access control list for
each Hadoop service. Every access control list has a simple format:
The lists of users and groups are both comma-separated lists of names.
The two lists are separated by a space.
Example: <<<user1,user2 group1,group2>>>.
Add a blank at the beginning of the line if only a list of groups is to
be provided; equivalently, a comma-separated list of users followed by
a space or nothing implies only a set of given users.
A special value of <<<*>>> implies that all users are allowed to access the
service.
** Refreshing Service Level Authorization Configuration
The service-level authorization configuration for the NameNode and
JobTracker can be changed without restarting either of the Hadoop
master daemons. The cluster administrator can change
<<<${HADOOP_CONF_DIR}/hadoop-policy.xml>>> on the master nodes and instruct
the NameNode and JobTracker to reload their respective configurations
via the <<<-refreshServiceAcl>>> switch to <<<dfsadmin>>> and <<<mradmin>>> commands
respectively.
Refresh the service-level authorization configuration for the NameNode:
----
$ bin/hadoop dfsadmin -refreshServiceAcl
----
Refresh the service-level authorization configuration for the
JobTracker:
----
$ bin/hadoop mradmin -refreshServiceAcl
----
Of course, one can use the <<<security.refresh.policy.protocol.acl>>>
property in <<<${HADOOP_CONF_DIR}/hadoop-policy.xml>>> to restrict access to
the ability to refresh the service-level authorization configuration to
certain users/groups.
** Examples
Allow only users <<<alice>>>, <<<bob>>> and users in the <<<mapreduce>>> group to submit
jobs to the MapReduce cluster:
----
<property>
<name>security.job.submission.protocol.acl</name>
<value>alice,bob mapreduce</value>
</property>
----
Allow only DataNodes running as the users who belong to the group
datanodes to communicate with the NameNode:
----
<property>
<name>security.datanode.protocol.acl</name>
<value>datanodes</value>
</property>
----
Allow any user to talk to the HDFS cluster as a DFSClient:
----
<property>
<name>security.client.protocol.acl</name>
<value>*</value>
</property>
----

View File

@ -0,0 +1,239 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Single Node Setup
---
---
${maven.build.timestamp}
Single Node Setup
%{toc|section=1|fromDepth=0}
* Purpose
This document describes how to set up and configure a single-node
Hadoop installation so that you can quickly perform simple operations
using Hadoop MapReduce and the Hadoop Distributed File System (HDFS).
* Prerequisites
** Supported Platforms
* GNU/Linux is supported as a development and production platform.
Hadoop has been demonstrated on GNU/Linux clusters with 2000 nodes.
* Win32 is supported as a development platform. Distributed operation
has not been well tested on Win32, so it is not supported as a
production platform.
** Required Software
Required software for Linux and Windows includes:
[[1]] Java^TM 1.6.x, preferably from Sun, must be installed.
[[2]] ssh must be installed and sshd must be running to use the Hadoop
scripts that manage remote Hadoop daemons.
Additional requirements for Windows include:
[[1]] Cygwin - Required for shell support in addition to the required
software above.
** Installing Software
If your cluster doesn't have the requisite software you will need to
install it.
For example on Ubuntu Linux:
----
$ sudo apt-get install ssh
$ sudo apt-get install rsync
----
On Windows, if you did not install the required software when you
installed cygwin, start the cygwin installer and select the packages:
* openssh - the Net category
* Download
To get a Hadoop distribution, download a recent stable release from one
of the Apache Download Mirrors.
* Prepare to Start the Hadoop Cluster
Unpack the downloaded Hadoop distribution. In the distribution, edit
the file <<<conf/hadoop-env.sh>>> to define at least <<<JAVA_HOME>>> to be the root
of your Java installation.
Try the following command:
----
$ bin/hadoop
----
This will display the usage documentation for the hadoop script.
Now you are ready to start your Hadoop cluster in one of the three
supported modes:
* Local (Standalone) Mode
* Pseudo-Distributed Mode
* Fully-Distributed Mode
* Standalone Operation
By default, Hadoop is configured to run in a non-distributed mode, as a
single Java process. This is useful for debugging.
The following example copies the unpacked conf directory to use as
input and then finds and displays every match of the given regular
expression. Output is written to the given output directory.
----
$ mkdir input
$ cp conf/*.xml input
$ bin/hadoop jar hadoop-*-examples.jar grep input output 'dfs[a-z.]+'
$ cat output/*
---
* Pseudo-Distributed Operation
Hadoop can also be run on a single-node in a pseudo-distributed mode
where each Hadoop daemon runs in a separate Java process.
** Configuration
Use the following:
conf/core-site.xml:
----
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
----
conf/hdfs-site.xml:
----
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
----
conf/mapred-site.xml:
----
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>localhost:9001</value>
</property>
</configuration>
----
** Setup passphraseless ssh
Now check that you can ssh to the localhost without a passphrase:
----
$ ssh localhost
----
If you cannot ssh to localhost without a passphrase, execute the
following commands:
----
$ ssh-keygen -t dsa -P '' -f ~/.ssh/id_dsa
$ cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
----
** Execution
Format a new distributed-filesystem:
----
$ bin/hadoop namenode -format
----
Start the hadoop daemons:
----
$ bin/start-all.sh
----
The hadoop daemon log output is written to the <<<${HADOOP_LOG_DIR}>>>
directory (defaults to <<<${HADOOP_PREFIX}/logs>>>).
Browse the web interface for the NameNode and the JobTracker; by
default they are available at:
* NameNode - <<<http://localhost:50070/>>>
* JobTracker - <<<http://localhost:50030/>>>
Copy the input files into the distributed filesystem:
----
$ bin/hadoop fs -put conf input
----
Run some of the examples provided:
----
$ bin/hadoop jar hadoop-*-examples.jar grep input output 'dfs[a-z.]+'
----
Examine the output files:
Copy the output files from the distributed filesystem to the local
filesystem and examine them:
----
$ bin/hadoop fs -get output output
$ cat output/*
----
or
View the output files on the distributed filesystem:
----
$ bin/hadoop fs -cat output/*
----
When you're done, stop the daemons with:
----
$ bin/stop-all.sh
----
* Fully-Distributed Operation
For information on setting up fully-distributed, non-trivial clusters
see {{{Cluster Setup}}}.
Java and JNI are trademarks or registered trademarks of Sun
Microsystems, Inc. in the United States and other countries.

View File

@ -0,0 +1,100 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Superusers Acting On Behalf Of Other Users
---
---
${maven.build.timestamp}
Superusers Acting On Behalf Of Other Users
%{toc|section=1|fromDepth=0}
* Introduction
This document describes how a superuser can submit jobs or access hdfs
on behalf of another user in a secured way.
* Use Case
The code example described in the next section is applicable for the
following use case.
A superuser with username 'super' wants to submit job and access hdfs
on behalf of a user joe. The superuser has kerberos credentials but
user joe doesn't have any. The tasks are required to run as user joe
and any file accesses on namenode are required to be done as user joe.
It is required that user joe can connect to the namenode or job tracker
on a connection authenticated with super's kerberos credentials. In
other words super is impersonating the user joe.
* Code example
In this example super's kerberos credentials are used for login and a
proxy user ugi object is created for joe. The operations are performed
within the doAs method of this proxy user ugi object.
----
...
//Create ugi for joe. The login user is 'super'.
UserGroupInformation ugi =
UserGroupInformation.createProxyUser("joe", UserGroupInformation.getLoginUser());
ugi.doAs(new PrivilegedExceptionAction<Void>() {
public Void run() throws Exception {
//Submit a job
JobClient jc = new JobClient(conf);
jc.submitJob(conf);
//OR access hdfs
FileSystem fs = FileSystem.get(conf);
fs.mkdirs(someFilePath);
return null;
}
});
----
* Configurations
The superuser must be configured on namenode and jobtracker to be
allowed to impersonate another user. Following configurations are
required.
----
<property>
<name>hadoop.proxyuser.super.groups</name>
<value>group1,group2</value>
<description>Allow the superuser super to impersonate any members of the group group1 and group2</description>
</property>
<property>
<name>hadoop.proxyuser.super.hosts</name>
<value>host1,host2</value>
<description>The superuser can connect only from host1 and host2 to impersonate a user</description>
</property>
----
If these configurations are not present, impersonation will not be
allowed and connection will fail.
If more lax security is preferred, the wildcard value * may be used to
allow impersonation from any host or of any user.
* Caveats
The superuser must have kerberos credentials to be able to impersonate
another user. It cannot use delegation tokens for this feature. It
would be wrong if superuser adds its own delegation token to the proxy
user ugi, as it will allow the proxy user to connect to the service
with the privileges of the superuser.
However, if the superuser does want to give a delegation token to joe,
it must first impersonate joe and get a delegation token for joe, in
the same way as the code example above, and add it to the ugi of joe.
In this way the delegation token will have the owner as joe.

View File

@ -546,4 +546,44 @@ public class TestFileUtil {
long expected = 2 * (3 + System.getProperty("line.separator").length());
Assert.assertEquals(expected, du);
}
/**
 * Unpacks {@code tarFile} into {@code untarDir} (deleting the directory
 * first if it already exists) and checks that each expected file under
 * the "name" directory exists and has the expected length.
 *
 * @param tarFile the archive to unpack
 * @param untarDir the directory to unpack into
 * @throws IOException if the old directory cannot be deleted or the
 *         extraction fails
 */
private void doUntarAndVerify(File tarFile, File untarDir)
    throws IOException {
  if (untarDir.exists() && !FileUtil.fullyDelete(untarDir)) {
    throw new IOException("Could not delete directory '" + untarDir + "'");
  }
  FileUtil.unTar(tarFile, untarDir);

  final String nameDir = untarDir.getCanonicalPath() + Path.SEPARATOR + "name";

  // Paths relative to nameDir, paired index-by-index with their sizes.
  final String[] relativePaths = {
      "version",
      "image" + Path.SEPARATOR + "fsimage",
      "current" + Path.SEPARATOR + "fsimage",
      "current" + Path.SEPARATOR + "edits",
      "current" + Path.SEPARATOR + "fstime"
  };
  final long[] expectedLengths = {0, 157, 4331, 1033, 8};

  for (int i = 0; i < relativePaths.length; i++) {
    File extracted = new File(nameDir + Path.SEPARATOR + relativePaths[i]);
    Assert.assertTrue(extracted.exists());
    Assert.assertTrue(extracted.length() == expectedLengths[i]);
  }
}
/**
 * Untars the gzipped and the plain test tarballs, one after the other,
 * into the same scratch directory and verifies the extracted contents
 * each time.
 */
@Test
public void testUntar() throws IOException {
  // Directory the test tarballs are read from.
  String cacheDir = System.getProperty("test.cache.data", "build/test/cache");
  File gzippedTarball = new File(cacheDir + "/test-untar.tgz");
  File plainTarball = new File(cacheDir + "/test-untar.tar");

  // Both archives are extracted into the same directory.
  File untarDir = new File(
      System.getProperty("test.build.data", "build/test/data"), "untarDir");

  doUntarAndVerify(gzippedTarball, untarDir);
  doUntarAndVerify(plainTarball, untarDir);
}
}

View File

@ -25,11 +25,14 @@ import java.io.IOException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.ArrayList;
import java.util.List;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assume.*;
import static org.junit.Assert.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@ -293,4 +296,40 @@ public class TestNativeIO {
assertFalse(NativeIO.getGroupName(0).isEmpty());
}
/**
 * Exercises {@code NativeIO.renameTo} in four situations: renaming a
 * nonexistent file (expects ENOENT), renaming a file to itself, renaming a
 * file to a new destination, and renaming to a path that uses a regular
 * file as a directory (expects ENOTDIR).
 *
 * The test is skipped (via assumeTrue) when the scratch directory cannot be
 * created.
 */
@Test
public void testRenameTo() throws Exception {
  final File TEST_DIR = new File(new File(
      System.getProperty("test.build.data","build/test/data")), "renameTest");
  assumeTrue(TEST_DIR.mkdirs());
  try {
    File nonExistentFile = new File(TEST_DIR, "nonexistent");
    File targetFile = new File(TEST_DIR, "target");

    // Test attempting to rename a nonexistent file.
    try {
      NativeIO.renameTo(nonExistentFile, targetFile);
      Assert.fail();
    } catch (NativeIOException e) {
      // Expected value goes first so failure messages read correctly.
      Assert.assertEquals(Errno.ENOENT, e.getErrno());
    }

    // Test renaming a file to itself. It should succeed and do nothing.
    File sourceFile = new File(TEST_DIR, "source");
    Assert.assertTrue(sourceFile.createNewFile());
    NativeIO.renameTo(sourceFile, sourceFile);

    // Test renaming a source to a destination.
    NativeIO.renameTo(sourceFile, targetFile);

    // Test renaming a source to a path which uses a file as a directory.
    sourceFile = new File(TEST_DIR, "source");
    Assert.assertTrue(sourceFile.createNewFile());
    File badTarget = new File(targetFile, "subdir");
    try {
      NativeIO.renameTo(sourceFile, badTarget);
      Assert.fail();
    } catch (NativeIOException e) {
      Assert.assertEquals(Errno.ENOTDIR, e.getErrno());
    }
  } finally {
    // Always remove the scratch directory. If it were left behind after a
    // failure, mkdirs() would return false on the next run and assumeTrue
    // would silently skip this test.
    FileUtils.deleteQuietly(TEST_DIR);
  }
}
}

View File

@ -56,6 +56,7 @@ import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
/**
* Test the MetricsSystemImpl class
@ -80,7 +81,7 @@ public class TestMetricsSystemImpl {
}
}
@Test public void testInitFirst() throws Exception {
@Test public void testInitFirstVerifyStopInvokedImmediately() throws Exception {
new ConfigBuilder().add("*.period", 8)
//.add("test.sink.plugin.urls", getPluginUrlsAsString())
.add("test.sink.test.class", TestSink.class.getName())
@ -106,14 +107,61 @@ public class TestMetricsSystemImpl {
ms.stop();
ms.shutdown();
verify(sink1, times(2)).putMetrics(r1.capture());
//When we call stop, at most two sources will be consumed by each sink thread.
verify(sink1, atMost(2)).putMetrics(r1.capture());
List<MetricsRecord> mr1 = r1.getAllValues();
verify(sink2, atMost(2)).putMetrics(r2.capture());
List<MetricsRecord> mr2 = r2.getAllValues();
if (mr1.size() != 0 && mr2.size() != 0) {
checkMetricsRecords(mr1);
assertEquals("output", mr1, mr2);
} else if (mr1.size() != 0) {
checkMetricsRecords(mr1);
} else if (mr2.size() != 0) {
checkMetricsRecords(mr2);
}
}
/**
 * Starts a MetricsSystemImpl against a freshly written test configuration,
 * registers two sources and two mock sinks, publishes metrics immediately,
 * and verifies that each sink receives two putMetrics callbacks before the
 * system is stopped. The captured records are then checked and compared
 * between the two sinks.
 */
@Test public void testInitFirstVerifyCallBacks() throws Exception {
// Shut down the default metrics system before building this test's own instance.
DefaultMetricsSystem.shutdown();
// Write the test configuration: period, sink class and the exclude filters.
new ConfigBuilder().add("*.period", 8)
//.add("test.sink.plugin.urls", getPluginUrlsAsString())
.add("test.sink.test.class", TestSink.class.getName())
.add("test.*.source.filter.exclude", "s0")
.add("test.source.s1.metric.filter.exclude", "X*")
.add("test.sink.sink1.metric.filter.exclude", "Y*")
.add("test.sink.sink2.metric.filter.exclude", "Y*")
.save(TestMetricsConfig.getTestFilename("hadoop-metrics2-test"));
MetricsSystemImpl ms = new MetricsSystemImpl("Test");
ms.start();
// Register two sources and update several metrics on s1.
ms.register("s0", "s0 desc", new TestSource("s0rec"));
TestSource s1 = ms.register("s1", "s1 desc", new TestSource("s1rec"));
s1.c1.incr();
s1.xxx.incr();
s1.g1.set(2);
s1.yyy.incr(2);
s1.s1.add(0);
// Mock sinks let the test observe the putMetrics callbacks.
MetricsSink sink1 = mock(MetricsSink.class);
MetricsSink sink2 = mock(MetricsSink.class);
ms.registerSink("sink1", "sink1 desc", sink1);
ms.registerSink("sink2", "sink2 desc", sink2);
ms.publishMetricsNow(); // publish the metrics
try {
// Each sink must see exactly two putMetrics calls; timeout(200) lets the
// verification wait for them (presumably up to 200 ms -- confirm against
// the Mockito version in use). r1/r2 are captors declared outside this method.
verify(sink1, timeout(200).times(2)).putMetrics(r1.capture());
verify(sink2, timeout(200).times(2)).putMetrics(r2.capture());
} finally {
// Stop and shut down even when verification fails.
ms.stop();
ms.shutdown();
}
//When we call stop, at most two sources will be consumed by each sink thread.
// NOTE(review): the comment above does not match this method, which verifies
// times(2) before stop() is called -- confirm whether it is stale.
List<MetricsRecord> mr1 = r1.getAllValues();
// NOTE(review): sink2 was already verified and captured into r2 inside the
// try block; this second verify captures into r2 again and has no sink1
// counterpart -- confirm it is intended and not a leftover line.
verify(sink2, times(2)).putMetrics(r2.capture());
List<MetricsRecord> mr2 = r2.getAllValues();
checkMetricsRecords(mr1);
assertEquals("output", mr1, mr2);
}
}
@Test public void testMultiThreadedPublish() throws Exception {
new ConfigBuilder().add("*.period", 80)
.add("test.sink.Collector.queue.capacity", "20")

View File

@ -314,6 +314,10 @@ Release 2.0.3-alpha - Unreleased
HDFS-4369. GetBlockKeysResponseProto does not handle null response.
(suresh)
HDFS-4451. hdfs balancer command returns exit code 1 on success instead
of 0. (Joshua Blatt via suresh)
NEW FEATURES
HDFS-2656. Add libwebhdfs, a pure C client based on WebHDFS.
@ -499,6 +503,8 @@ Release 2.0.3-alpha - Unreleased
HDFS-4259. Improve pipeline DN replacement failure message (harsh)
HDFS-3598. WebHDFS support for file concat. (Plamen Jeliazkov via shv)
OPTIMIZATIONS
HDFS-3429. DataNode reads checksums even if client does not need them (todd)
@ -733,6 +739,9 @@ Release 2.0.3-alpha - Unreleased
HDFS-4444. Add space between total transaction time and number of
transactions in FSEditLog#printStatistics. (Stephen Chu via suresh)
HDFS-4428. FsDatasetImpl should disclose what the error is when a rename
fails. (Colin Patrick McCabe via atm)
BREAKDOWN OF HDFS-3077 SUBTASKS
HDFS-3077. Quorum-based protocol for reading and writing edit logs.

View File

@ -1,237 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>Synthetic Load Generator Guide </title>
</header>
<body>
<section>
<title>Overview</title>
<p>
The synthetic load generator (SLG) is a tool for testing NameNode behavior
under different client loads. The user can generate different mixes
of read, write, and list requests by specifying the probabilities of
read and write. The user controls the intensity of the load by adjusting
parameters for the number of worker threads and the delay between
operations. While load generators are running, the user can profile and
monitor the running of the NameNode. When a load generator exits, it
prints some NameNode statistics like the average execution time of each
kind of operation and the NameNode throughput.
</p>
</section>
<section>
<title> Synopsis </title>
<p>
The synopsis of the command is:
</p>
<source>java LoadGenerator [options]</source>
<p> Options include:</p>
<ul>
<li>
<code>-readProbability &lt;read probability&gt;</code><br/>
The probability of the read operation; default is 0.3333.
</li>
<li>
<code>-writeProbability &lt;write probability&gt;</code><br/>
The probability of the write operations; default is 0.3333.
</li>
<li>
<code>-root &lt;test space root&gt;</code><br/>
The root of the test space; default is /testLoadSpace.
</li>
<li>
<code>-maxDelayBetweenOps &lt;maxDelayBetweenOpsInMillis&gt;</code><br/>
The maximum delay between two consecutive operations in a thread; default is 0 indicating no delay.
</li>
<li>
<code>-numOfThreads &lt;numOfThreads&gt;</code><br/>
The number of threads to spawn; default is 200.
</li>
<li>
<code>-elapsedTime &lt;elapsedTimeInSecs&gt;</code><br/>
The number of seconds that the program
will run; A value of zero indicates that the program runs
forever. The default value is 0.
</li>
<li>
<code>-startTime &lt;startTimeInMillis&gt;</code><br/>
The time that all worker threads
start to run. By default it is 10 seconds after the main
program starts running. This creates a barrier if more than
one load generator is running.
</li>
<li>
<code>-seed &lt;seed&gt;</code><br/>
The random generator seed for repeating
requests to NameNode when running with a single thread;
default is the current time.
</li>
</ul>
<p>
After command line argument parsing, the load generator traverses
the test space and builds a table of all directories and another table
of all files in the test space. It then waits until the start time to
spawn the number of worker threads as specified by the user.
Each thread sends a stream of requests to NameNode. At each iteration,
it first decides if it is going to read a file, create a file, or
list a directory following the read and write probabilities specified
by the user. The listing probability is equal to
<em>1-read probability-write probability</em>. When reading,
it randomly picks a file in the test space and reads the entire file.
When writing, it randomly picks a directory in the test space and
creates a file there.
</p>
<p>
To avoid two threads with the same load
generator or from two different load generators creating the same
file, the file name consists of the current machine's host name
and the thread id. The length of the file follows Gaussian
distribution with an average size of 2 blocks and the standard
deviation of 1. The new file is filled with byte 'a'. To avoid the test
space growing indefinitely, the file is deleted immediately
after the file creation completes. While listing, it randomly picks
a directory in the test space and lists its content.
</p>
<p>
After an operation completes, the thread pauses for a random
amount of time in the range of [0, maxDelayBetweenOps] if the
specified maximum delay is not zero. All threads are stopped when
the specified elapsed time is passed. Before exiting, the program
prints the average execution for each kind of NameNode operations,
and the number of requests served by the NameNode per second.
</p>
</section>
<section>
<title> Test Space Population </title>
<p>
The user needs to populate a test space before running a
load generator. The structure generator generates a random
test space structure and the data generator creates the files
and directories of the test space in Hadoop distributed file system.
</p>
<section>
<title> Structure Generator </title>
<p>
This tool generates a random namespace structure with the
following constraints:
</p>
<ol>
<li>The number of subdirectories that a directory can have is
a random number in [minWidth, maxWidth].</li>
<li>The maximum depth of each subdirectory is a random number
[2*maxDepth/3, maxDepth].</li>
<li>Files are randomly placed in leaf directories. The size of
each file follows Gaussian distribution with an average size
of 1 block and a standard deviation of 1.</li>
</ol>
<p>
The generated namespace structure is described by two files in
the output directory. Each line of the first file contains the
full name of a leaf directory. Each line of the second file
contains the full name of a file and its size, separated by a blank.
</p>
<p>
The synopsis of the command is:
</p>
<source>java StructureGenerator [options]</source>
<p>Options include:</p>
<ul>
<li>
<code>-maxDepth &lt;maxDepth&gt;</code><br/>
Maximum depth of the directory tree; default is 5.
</li>
<li>
<code>-minWidth &lt;minWidth&gt;</code><br/>
Minimum number of subdirectories per directory; default is 1.
</li>
<li>
<code>-maxWidth &lt;maxWidth&gt;</code><br/>
Maximum number of subdirectories per directory; default is 5.
</li>
<li>
<code>-numOfFiles &lt;#OfFiles&gt;</code><br/>
The total number of files in the test space; default is 10.
</li>
<li>
<code>-avgFileSize &lt;avgFileSizeInBlocks&gt;</code><br/>
Average file size, in blocks; default is 1.
</li>
<li>
<code>-outDir &lt;outDir&gt;</code><br/>
Output directory; default is the current directory.
</li>
<li>
<code>-seed &lt;seed&gt;</code><br/>
Random number generator seed; default is the current time.
</li>
</ul>
</section>
<section>
<title>Data Generator </title>
<p>
This tool reads the directory structure and file structure from
the input directory and creates the namespace in Hadoop distributed
file system. All files are filled with byte 'a'.
</p>
<p>
The synopsis of the command is:
</p>
<source>java DataGenerator [options]</source>
<p>Options include:</p>
<ul>
<li>
<code>-inDir &lt;inDir&gt;</code><br/>
Input directory name where directory/file
structures are stored; default is the current directory.
</li>
<li>
<code>-root &lt;test space root&gt;</code><br/>
The name of the root directory which the
new namespace is going to be placed under;
default is "/testLoadSpace".
</li>
</ul>
</section>
</section>
</body>
</document>

View File

@ -1,395 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>Fault Injection Framework and Development Guide</title>
</header>
<body>
<section>
<title>Introduction</title>
<p>This guide provides an overview of the Hadoop Fault Injection (FI) framework for those
who will be developing their own faults (aspects).
</p>
<p>The idea of fault injection is fairly simple: it is an
infusion of errors and exceptions into an application's logic to
achieve a higher coverage and fault tolerance of the system.
Different implementations of this idea are available today.
Hadoop's FI framework is built on top of Aspect Oriented Paradigm
(AOP) implemented by AspectJ toolkit.
</p>
</section>
<section>
<title>Assumptions</title>
<p>The current implementation of the FI framework assumes that the faults it
will be emulating are of non-deterministic nature. That is, the moment
of a fault's happening isn't known in advance and is coin-flip based.
</p>
</section>
<section>
<title>Architecture of the Fault Injection Framework</title>
<figure src="images/FI-framework.gif" alt="Components layout" />
<section>
<title>Configuration Management</title>
<p>This piece of the FI framework allows you to set expectations for faults to happen.
The settings can be applied either statically (in advance) or in runtime.
The desired level of faults in the framework can be configured two ways:
</p>
<ul>
<li>
editing
<code>src/aop/fi-site.xml</code>
configuration file. This file is similar to other Hadoop's config
files
</li>
<li>
setting system properties of JVM through VM startup parameters or in
<code>build.properties</code>
file
</li>
</ul>
</section>
<section>
<title>Probability Model</title>
<p>This is fundamentally a coin flipper. The methods of this class are
getting a random number between 0.0
and 1.0 and then checking whether that number falls within the
range from 0.0 to the configured level for the fault in question. If that
condition is true then the fault will occur.
</p>
<p>Thus, to guarantee the happening of a fault one needs to set an
appropriate level to 1.0.
To completely prevent a fault from happening its probability level
has to be set to 0.0.
</p>
<p><strong>Note</strong>: The default probability level is set to 0
(zero) unless the level is changed explicitly through the
configuration file or in the runtime. The name of the default
level's configuration parameter is
<code>fi.*</code>
</p>
</section>
<section>
<title>Fault Injection Mechanism: AOP and AspectJ</title>
<p>The foundation of Hadoop's FI framework includes a
cross-cutting concept implemented by AspectJ. The following basic
terms are important to remember:
</p>
<ul>
<li>
<strong>A cross-cutting concept</strong>
(aspect) is behavior, and often data, that is used across the scope
of a piece of software
</li>
<li>In AOP, the
<strong>aspects</strong>
provide a mechanism by which a cross-cutting concern can be
specified in a modular way
</li>
<li>
<strong>Advice</strong>
is the
code that is executed when an aspect is invoked
</li>
<li>
<strong>Join point</strong>
(or pointcut) is a specific
point within the application that may or may not invoke some advice
</li>
</ul>
</section>
<section>
<title>Existing Join Points</title>
<p>
The following readily available join points are provided by AspectJ:
</p>
<ul>
<li>Join when a method is called
</li>
<li>Join during a method's execution
</li>
<li>Join when a constructor is invoked
</li>
<li>Join during a constructor's execution
</li>
<li>Join during aspect advice execution
</li>
<li>Join before an object is initialized
</li>
<li>Join during object initialization
</li>
<li>Join during static initializer execution
</li>
<li>Join when a class's field is referenced
</li>
<li>Join when a class's field is assigned
</li>
<li>Join when a handler is executed
</li>
</ul>
</section>
</section>
<section>
<title>Aspect Example</title>
<source>
package org.apache.hadoop.hdfs.server.datanode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fi.ProbabilityModel;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.util.DiskChecker.*;
import java.io.IOException;
import java.io.OutputStream;
import java.io.DataOutputStream;
/**
* This aspect takes care about faults injected into datanode.BlockReceiver
* class
*/
public aspect BlockReceiverAspects {
public static final Log LOG = LogFactory.getLog(BlockReceiverAspects.class);
public static final String BLOCK_RECEIVER_FAULT="hdfs.datanode.BlockReceiver";
pointcut callReceivePacket() : call (* OutputStream.write(..))
&amp;&amp; withincode (* BlockReceiver.receivePacket(..))
// to further limit the application of this aspect a very narrow 'target' can be used as follows
// &amp;&amp; target(DataOutputStream)
&amp;&amp; !within(BlockReceiverAspects +);
before () throws IOException : callReceivePacket () {
if (ProbabilityModel.injectCriteria(BLOCK_RECEIVER_FAULT)) {
LOG.info("Before the injection point");
Thread.dumpStack();
throw new DiskOutOfSpaceException ("FI: injected fault point at " +
thisJoinPoint.getStaticPart( ).getSourceLocation());
}
}
}
</source>
<p>The aspect has two main parts: </p>
<ul>
<li>The join point
<code>pointcut callReceivePacket()</code>
which serves as an identification mark of a specific point (in control
and/or data flow) in the life of an application. </li>
<li> A call to the advice -
<code>before () throws IOException : callReceivePacket()</code>
- will be injected (see
<a href="#Putting+it+all+together">Putting It All Together</a>)
before that specific spot of the application's code.</li>
</ul>
<p>The pointcut identifies an invocation of class'
<code>java.io.OutputStream write()</code>
method
with any number of parameters and any return type. This invoke should
take place within the body of method
<code>receivePacket()</code>
from class <code>BlockReceiver</code>.
The method can have any parameters and any return type.
Possible invocations of
<code>write()</code>
method happening anywhere within the aspect
<code>BlockReceiverAspects</code>
or its heirs will be ignored.
</p>
<p><strong>Note 1</strong>: This short example doesn't illustrate
the fact that you can have more than a single injection point per
class. In such a case the names of the faults have to be different
if a developer wants to trigger them separately.
</p>
<p><strong>Note 2</strong>: After the injection step (see
<a href="#Putting+it+all+together">Putting It All Together</a>)
you can verify that the faults were properly injected by
searching for <code>ajc</code> keywords in a disassembled class file.
</p>
</section>
<section>
<title>Fault Naming Convention and Namespaces</title>
<p>For the sake of a unified naming
convention the following two types of names are recommended for a
new aspects development:</p>
<ul>
<li>Activity specific notation
(when we don't care about a particular location of a fault's
happening). In this case the name of the fault is rather abstract:
<code>fi.hdfs.DiskError</code>
</li>
<li>Location specific notation.
Here, the fault's name is mnemonic as in:
<code>fi.hdfs.datanode.BlockReceiver[optional location details]</code>
</li>
</ul>
</section>
<section>
<title>Development Tools</title>
<ul>
<li>The Eclipse
<a href="http://www.eclipse.org/ajdt/">AspectJ Development Toolkit</a>
may help you when developing aspects
</li>
<li>IntelliJ IDEA provides AspectJ weaver and Spring-AOP plugins
</li>
</ul>
</section>
<section>
<title>Putting It All Together</title>
<p>Faults (aspects) have to be injected (or woven) together before
they can be used. Follow these instructions:</p>
<ul>
<li>To weave aspects in place use:
<source>
% ant injectfaults
</source>
</li>
<li>If you
misidentified the join point of your aspect you will see a
warning (similar to the one shown here) when 'injectfaults' target is
completed:
<source>
[iajc] warning at
src/test/aop/org/apache/hadoop/hdfs/server/datanode/ \
BlockReceiverAspects.aj:44::0
advice defined in org.apache.hadoop.hdfs.server.datanode.BlockReceiverAspects
has not been applied [Xlint:adviceDidNotMatch]
</source>
</li>
<li>It isn't an error, so the build will report the successful result. <br />
To prepare dev.jar file with all your faults woven in place (HDFS-475 pending) use:
<source>
% ant jar-fault-inject
</source>
</li>
<li>To create test jars use:
<source>
% ant jar-test-fault-inject
</source>
</li>
<li>To run HDFS tests with faults injected use:
<source>
% ant run-test-hdfs-fault-inject
</source>
</li>
</ul>
<section>
<title>How to Use the Fault Injection Framework</title>
<p>Faults can be triggered as follows:
</p>
<ul>
<li>During runtime:
<source>
% ant run-test-hdfs -Dfi.hdfs.datanode.BlockReceiver=0.12
</source>
To set a certain level, for example 25%, of all injected faults use:
<br/>
<source>
% ant run-test-hdfs-fault-inject -Dfi.*=0.25
</source>
</li>
<li>From a program:
<source>
package org.apache.hadoop.fs;
import org.junit.Test;
import org.junit.Before;
public class DemoFiTest {
public static final String BLOCK_RECEIVER_FAULT="hdfs.datanode.BlockReceiver";
@Override
@Before
public void setUp() {
//Setting up the test's environment as required
}
@Test
public void testFI() {
// It triggers the fault, assuming that there's one called 'hdfs.datanode.BlockReceiver'
System.setProperty("fi." + BLOCK_RECEIVER_FAULT, "0.12");
//
// The main logic of your tests goes here
//
// Now set the level back to 0 (zero) to prevent this fault from happening again
System.setProperty("fi." + BLOCK_RECEIVER_FAULT, "0.0");
// or delete its trigger completely
System.getProperties().remove("fi." + BLOCK_RECEIVER_FAULT);
}
@Override
@After
public void tearDown() {
//Cleaning up the test environment
}
}
</source>
</li>
</ul>
<p>
As you can see above these two methods do the same thing. They are
setting the probability level of <code>hdfs.datanode.BlockReceiver</code>
at 12%. The difference, however, is that the program provides more
flexibility and allows you to turn a fault off when a test no longer needs it.
</p>
</section>
</section>
<section>
<title>Additional Information and Contacts</title>
<p>These two sources of information are particularly
interesting and worth reading:
</p>
<ul>
<li>
<a href="http://www.eclipse.org/aspectj/doc/next/devguide/">
http://www.eclipse.org/aspectj/doc/next/devguide/
</a>
</li>
<li>AspectJ Cookbook (ISBN-13: 978-0-596-00654-9)
</li>
</ul>
<p>If you have additional comments or questions for the author check
<a href="http://issues.apache.org/jira/browse/HDFS-435">HDFS-435</a>.
</p>
</section>
</body>
</document>

View File

@ -1,157 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>Offline Edits Viewer Guide</title>
<authors>
<person name="Erik Steffl" email="steffl@yahoo-inc.com"/>
</authors>
</header>
<body>
<section>
<title>Overview</title>
<p>
Offline Edits Viewer is a tool to parse the Edits log file. The
current processors are mostly useful for conversion between
different formats, including XML which is human readable and
easier to edit than native binary format.
</p>
<p>
The tool can parse the edits formats -18 (roughly Hadoop 0.19)
and later. The tool operates on files only, it does not need
Hadoop cluster to be running.
</p>
<p>Input formats supported:</p>
<ol>
<li><strong>binary</strong>: native binary format that Hadoop uses internally</li>
<li>
<strong>xml</strong>: XML format, as produced by
<strong>xml</strong> processor, used if filename has xml
(case insensitive) extension
</li>
</ol>
<p>
The Offline Edits Viewer provides several output processors
(unless stated otherwise the output of the processor can be
converted back to original edits file):
</p>
<ol>
<li><strong>binary</strong>: native binary format that Hadoop uses internally</li>
<li><strong>xml</strong>: XML format</li>
<li><strong>stats</strong>: prints out statistics, this cannot be converted back to Edits file</li>
</ol>
</section> <!-- Overview -->
<section>
<title>Usage</title>
<p><code>bash$ bin/hdfs oev -i edits -o edits.xml</code></p>
<table>
<tr><th>Flag</th><th>Description</th></tr>
<tr>
<td><code>[-i|--inputFile] &lt;input file&gt;</code></td>
<td>
Specify the input edits log file to process. Xml (case
insensitive) extension means XML format otherwise binary
format is assumed. Required.
</td>
</tr>
<tr>
<td><code>[-o|--outputFile] &lt;output file&gt;</code></td>
<td>
Specify the output filename, if the specified output processor
generates one. If the specified file already exists, it is
silently overwritten. Required.
</td>
</tr>
<tr>
<td><code>[-p|--processor] &lt;processor&gt;</code></td>
<td>
Specify the image processor to apply against the image
file. Currently valid options are <strong>binary</strong>,
<strong>xml</strong> (default) and <strong>stats</strong>.
</td>
</tr>
<tr>
<td><code>[-v|--verbose]</code></td>
<td>
Print the input and output filenames and pipe output of
processor to console as well as specified file. On extremely
large files, this may increase processing time by an order
of magnitude.
</td>
</tr>
<tr>
<td><code>[-h|--help]</code></td>
<td>
Display the tool usage and help information and exit.
</td>
</tr>
</table>
</section> <!-- Usage -->
<section>
<title>Case study: Hadoop cluster recovery</title>
<p>
In case there is some problem with hadoop cluster and the edits
file is corrupted it is possible to save at least part of the
edits file that is correct. This can be done by converting the
binary edits to XML, edit it manually and then convert it back
to binary. The most common problem is that the edits file is
missing the closing record (record that has opCode -1). This
should be recognized by the tool and the XML format should be
properly closed.
</p>
<p>
If there is no closing record in the XML file you can add one
after last correct record. Anything after the record with opCode
-1 is ignored.
</p>
<p>Example of a closing record (with opCode -1):</p>
<source>
&lt;RECORD&gt;
&lt;OPCODE&gt;-1&lt;/OPCODE&gt;
&lt;DATA&gt;
&lt;/DATA&gt;
&lt;/RECORD&gt;
</source>
</section> <!-- Case study: Hadoop cluster recovery -->
</body>
</document>

View File

@ -1,427 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>Offline Image Viewer Guide</title>
</header>
<body>
<section>
<title>Overview</title>
<p>The Offline Image Viewer is a tool to dump the contents of hdfs
fsimage files to human-readable formats in order to allow offline analysis
and examination of a Hadoop cluster's namespace. The tool is able to
process very large image files relatively quickly, converting them to
one of several output formats. The tool handles the layout formats that
were included with Hadoop versions 16 and up. If the tool is not able to
process an image file, it will exit cleanly. The Offline Image Viewer does not require
a Hadoop cluster to be running; it is entirely offline in its operation.</p>
<p>The Offline Image Viewer provides several output processors:</p>
<ol>
<li><strong>Ls</strong> is the default output processor. It closely mimics the format of
the <code>lsr </code> command. It includes the same fields, in the same order, as
<code>lsr </code>: directory or file flag, permissions, replication, owner, group,
file size, modification date, and full path. Unlike the <code>lsr </code> command,
the root path is included. One important difference between the output
of the <code>lsr </code> command and this processor is that this output is not sorted
by directory name and contents. Rather, the files are listed in the
order in which they are stored in the fsimage file. Therefore, it is
not possible to directly compare the output of the <code>lsr </code> command with the output of
this tool. The Ls processor uses information contained within the Inode blocks to
calculate file sizes and ignores the <code>-skipBlocks</code> option.</li>
<li><strong>Indented</strong> provides a more complete view of the fsimage's contents,
including all of the information included in the image, such as image
version, generation stamp and inode- and block-specific listings. This
processor uses indentation to organize the output in a hierarchical manner.
The <code>lsr </code> format is suitable for easy human comprehension.</li>
<li><strong>Delimited</strong> provides one file per line consisting of the path,
replication, modification time, access time, block size, number of blocks, file size,
namespace quota, diskspace quota, permissions, username and group name. If run against
an fsimage that does not contain any of these fields, the field's column will be included,
but no data recorded. The default record delimiter is a tab, but this may be changed
via the <code>-delimiter</code> command line argument. This processor is designed to
create output that is easily analyzed by other tools, such as <a href="http://hadoop.apache.org/pig/">Apache Pig</a>.
See the <a href="#analysis">Analyzing Results</a> section
for further information on using this processor to analyze the contents of fsimage files.</li>
<li><strong>XML</strong> creates an XML document of the fsimage and includes all of the
information within the fsimage, similar to the <code>lsr </code> processor. The output
of this processor is amenable to automated processing and analysis with XML tools.
Due to the verbosity of the XML syntax, this processor will also generate
the largest amount of output.</li>
<li><strong>FileDistribution</strong> is the tool for analyzing file
sizes in the namespace image. In order to run the tool one should
define a range of integers <code>[0, maxSize]</code> by specifying
<code>maxSize</code> and a <code>step</code>.
The range of integers is divided into segments of size
<code>step</code>:
<code>[0, s</code><sub>1</sub><code>, ..., s</code><sub>n-1</sub><code>, maxSize]</code>,
and the processor calculates how many files in the system fall into
each segment <code>[s</code><sub>i-1</sub><code>, s</code><sub>i</sub><code>)</code>.
Note that files larger than <code>maxSize</code> always fall into
the very last segment.
The output file is formatted as a tab separated two column table:
Size and NumFiles, where Size represents the start of the segment,
and NumFiles is the number of files from the image whose size falls
in this segment.</li>
</ol>
</section> <!-- overview -->
<section>
<title>Usage</title>
<section>
<title>Basic</title>
<p>The simplest usage of the Offline Image Viewer is to provide just an input and output
file, via the <code>-i</code> and <code>-o</code> command-line switches:</p>
<p><code>bash$ bin/hdfs oiv -i fsimage -o fsimage.txt</code><br/></p>
<p>This will create a file named fsimage.txt in the current directory using
the Ls output processor. For very large image files, this process may take
several minutes.</p>
<p>One can specify which output processor via the command-line switch <code>-p</code>.
For instance:</p>
<p><code>bash$ bin/hdfs oiv -i fsimage -o fsimage.xml -p XML</code><br/></p>
<p>or</p>
<p><code>bash$ bin/hdfs oiv -i fsimage -o fsimage.txt -p Indented</code><br/></p>
<p>This will run the tool using either the XML or Indented output processor,
respectively.</p>
<p>One command-line option worth considering is <code>-skipBlocks</code>, which
prevents the tool from explicitly enumerating all of the blocks that make up
a file in the namespace. This is useful for file systems that have very large
files. Enabling this option can significantly decrease the size of the resulting
output, as individual blocks are not included. Note, however, that the Ls processor
needs to enumerate the blocks and so overrides this option.</p>
</section> <!-- Basic -->
<section id="Example">
<title>Example</title>
<p>Consider the following contrived namespace:</p>
<source>
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:17 /anotherDir
-rw-r--r-- 3 theuser supergroup 286631664 2009-03-16 21:15 /anotherDir/biggerfile
-rw-r--r-- 3 theuser supergroup 8754 2009-03-16 21:17 /anotherDir/smallFile
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:11 /mapredsystem
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:11 /mapredsystem/theuser
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:11 /mapredsystem/theuser/mapredsystem
drwx-wx-wx - theuser supergroup 0 2009-03-16 21:11 /mapredsystem/theuser/mapredsystem/ip.redacted.com
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:12 /one
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:12 /one/two
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:16 /user
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:19 /user/theuser
</source>
<p>Applying the Offline Image Processor against this file with default options would result in the following output:</p>
<source>
machine:hadoop-0.21.0-dev theuser$ bin/hdfs oiv -i fsimagedemo -o fsimage.txt
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:16 /
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:17 /anotherDir
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:11 /mapredsystem
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:12 /one
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:16 /user
-rw-r--r-- 3 theuser supergroup 286631664 2009-03-16 14:15 /anotherDir/biggerfile
-rw-r--r-- 3 theuser supergroup 8754 2009-03-16 14:17 /anotherDir/smallFile
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:11 /mapredsystem/theuser
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:11 /mapredsystem/theuser/mapredsystem
drwx-wx-wx - theuser supergroup 0 2009-03-16 14:11 /mapredsystem/theuser/mapredsystem/ip.redacted.com
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:12 /one/two
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:19 /user/theuser
</source>
<p>Similarly, applying the Indented processor would generate output that begins with:</p>
<source>
machine:hadoop-0.21.0-dev theuser$ bin/hdfs oiv -i fsimagedemo -p Indented -o fsimage.txt
FSImage
ImageVersion = -19
NamespaceID = 2109123098
GenerationStamp = 1003
INodes [NumInodes = 12]
Inode
INodePath =
Replication = 0
ModificationTime = 2009-03-16 14:16
AccessTime = 1969-12-31 16:00
BlockSize = 0
Blocks [NumBlocks = -1]
NSQuota = 2147483647
DSQuota = -1
Permissions
Username = theuser
GroupName = supergroup
PermString = rwxr-xr-x
...remaining output omitted...
</source>
</section> <!-- example-->
</section>
<section id="options">
<title>Options</title>
<section>
<title>Option Index</title>
<table>
<tr><th> Flag </th><th> Description </th></tr>
<tr><td><code>[-i|--inputFile] &lt;input file&gt;</code></td>
<td>Specify the input fsimage file to process. Required.</td></tr>
<tr><td><code>[-o|--outputFile] &lt;output file&gt;</code></td>
<td>Specify the output filename, if the specified output processor
generates one. If the specified file already exists, it is silently overwritten. Required.
</td></tr>
<tr><td><code>[-p|--processor] &lt;processor&gt;</code></td>
<td>Specify the image processor to apply against the image file. Currently
valid options are Ls (default), XML, Delimited, Indented, and FileDistribution.
</td></tr>
<tr><td><code>-skipBlocks</code></td>
<td>Do not enumerate individual blocks within files. This may save processing time
and output file space on namespaces with very large files. The <code>Ls</code> processor reads
the blocks to correctly determine file sizes and ignores this option.</td></tr>
<tr><td><code>-printToScreen</code></td>
<td>Pipe output of processor to console as well as specified file. On extremely
large namespaces, this may increase processing time by an order of magnitude.</td></tr>
<tr><td><code>-delimiter &lt;arg&gt;</code></td>
<td>When used in conjunction with the Delimited processor, replaces the default
tab delimiter with the string specified by <code>arg</code>.</td></tr>
<tr><td><code>[-h|--help]</code></td>
<td>Display the tool usage and help information and exit.</td></tr>
</table>
</section> <!-- options -->
</section>
<section id="analysis">
<title>Analyzing Results</title>
<p>The Offline Image Viewer makes it easy to gather large amounts of data about the hdfs namespace.
This information can then be used to explore file system usage patterns or find
specific files that match arbitrary criteria, along with other types of namespace analysis. The Delimited
image processor in particular creates
output that is amenable to further processing by tools such as <a href="http://hadoop.apache.org/pig/">Apache Pig</a>. Pig provides a particularly
good choice for analyzing these data as it is able to deal with the output generated from a small fsimage
but also scales up to consume data from extremely large file systems.</p>
<p>The Delimited image processor generates lines of text separated, by default, by tabs and includes
all of the fields that are common between constructed files and files that were still under construction
when the fsimage was generated. Example scripts are provided demonstrating how to use this output to
accomplish three tasks: determine the number of files each user has created on the file system,
find files that were created but have not been accessed, and find probable duplicates of large files by comparing
the size of each file.</p>
<p>Each of the following scripts assumes you have generated an output file using the Delimited processor named
<code>foo</code> and will be storing the results of the Pig analysis in a file named <code>results</code>.</p>
<section>
<title>Total Number of Files for Each User</title>
<p>This script processes each path within the namespace, groups them by the file owner and determines the total
number of files each user owns.</p>
<p><strong>numFilesOfEachUser.pig:</strong></p>
<source>
-- This script determines the total number of files each user has in
-- the namespace. Its output is of the form:
-- username, totalNumFiles
-- Load all of the fields from the file
A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
replication:int,
modTime:chararray,
accessTime:chararray,
blockSize:long,
numBlocks:int,
fileSize:long,
NamespaceQuota:int,
DiskspaceQuota:int,
perms:chararray,
username:chararray,
groupname:chararray);
-- Grab just the path and username
B = FOREACH A GENERATE path, username;
-- Generate the sum of the number of paths for each user
C = FOREACH (GROUP B BY username) GENERATE group, COUNT(B.path);
-- Save results
STORE C INTO '$outputFile';
</source>
<p>This script can be run against pig with the following command:</p>
<p><code>bin/pig -x local -param inputFile=../foo -param outputFile=../results ../numFilesOfEachUser.pig</code><br/></p>
<p>The output file's content will be similar to that below:</p>
<p>
<code>bart 1</code><br/>
<code>lisa 16</code><br/>
<code>homer 28</code><br/>
<code>marge 2456</code><br/>
</p>
</section>
<section><title>Files That Have Never Been Accessed</title>
<p>This script finds files that were created but whose access times were never changed, meaning they were never opened or viewed.</p>
<p><strong>neverAccessed.pig:</strong></p>
<source>
-- This script generates a list of files that were created but never
-- accessed, based on their AccessTime
-- Load all of the fields from the file
A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
replication:int,
modTime:chararray,
accessTime:chararray,
blockSize:long,
numBlocks:int,
fileSize:long,
NamespaceQuota:int,
DiskspaceQuota:int,
perms:chararray,
username:chararray,
groupname:chararray);
-- Grab just the path and last time the file was accessed
B = FOREACH A GENERATE path, accessTime;
-- Drop all the paths that don't have the default assigned last-access time
C = FILTER B BY accessTime == '1969-12-31 16:00';
-- Drop the accessTimes, since they're all the same
D = FOREACH C GENERATE path;
-- Save results
STORE D INTO '$outputFile';
</source>
<p>This script can be run against pig with the following command and its output file's content will be a list of files that were created but never viewed afterwards.</p>
<p><code>bin/pig -x local -param inputFile=../foo -param outputFile=../results ../neverAccessed.pig</code><br/></p>
</section>
<section><title>Probable Duplicated Files Based on File Size</title>
<p>This script groups files together based on their size, drops any that are of less than 100mb and returns a list of the file size, number of files found and a tuple of the file paths. This can be used to find likely duplicates within the filesystem namespace.</p>
<p><strong>probableDuplicates.pig:</strong></p>
<source>
-- This script finds probable duplicate files greater than 100 MB by
-- grouping together files based on their byte size. Files of this size
-- with exactly the same number of bytes can be considered probable
-- duplicates, but should be checked further, either by comparing the
-- contents directly or by another proxy, such as a hash of the contents.
-- The script's output is of the type:
-- fileSize numProbableDuplicates {(probableDup1), (probableDup2)}
-- Load all of the fields from the file
A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
replication:int,
modTime:chararray,
accessTime:chararray,
blockSize:long,
numBlocks:int,
fileSize:long,
NamespaceQuota:int,
DiskspaceQuota:int,
perms:chararray,
username:chararray,
groupname:chararray);
-- Grab the pathname and filesize
B = FOREACH A generate path, fileSize;
-- Drop files smaller than 100 MB
C = FILTER B by fileSize > 100L * 1024L * 1024L;
-- Gather all the files of the same byte size
D = GROUP C by fileSize;
-- Generate path, num of duplicates, list of duplicates
E = FOREACH D generate group AS fileSize, COUNT(C) as numDupes, C.path AS files;
-- Drop all the files where there are only one of them
F = FILTER E by numDupes > 1L;
-- Sort by the size of the files
G = ORDER F by fileSize;
-- Save results
STORE G INTO '$outputFile';
</source>
<p>This script can be run against pig with the following command:</p>
<p><code>bin/pig -x local -param inputFile=../foo -param outputFile=../results ../probableDuplicates.pig</code><br/></p>
<p> The output file's content will be similar to that below:</p>
<source>
1077288632 2 {(/user/tennant/work1/part-00501),(/user/tennant/work1/part-00993)}
1077288664 4 {(/user/tennant/work0/part-00567),(/user/tennant/work0/part-03980),(/user/tennant/work1/part-00725),(/user/eccelston/output/part-03395)}
1077288668 3 {(/user/tennant/work0/part-03705),(/user/tennant/work0/part-04242),(/user/tennant/work1/part-03839)}
1077288698 2 {(/user/tennant/work0/part-00435),(/user/eccelston/output/part-01382)}
1077288702 2 {(/user/tennant/work0/part-03864),(/user/eccelston/output/part-03234)}
</source>
<p>Each line includes the file size in bytes that was found to be duplicated, the number of duplicates found, and a list of the duplicated paths.
Files less than 100MB are ignored, providing a reasonable likelihood that files of these exact sizes may be duplicates.</p>
</section>
</section>
</body>
</document>

View File

@ -1,260 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
"http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>
Permissions Guide
</title>
</header>
<body>
<section> <title>Overview</title>
<p>
The Hadoop Distributed File System (HDFS) implements a permissions model for files and directories that shares much of the POSIX model.
Each file and directory is associated with an <em>owner</em> and a <em>group</em>. The file or directory has separate permissions for the
user that is the owner, for other users that are members of the group, and for all other users.
For files, the <em>r</em> permission is required to read the file, and the <em>w</em> permission is required to write or append to the file.
For directories, the <em>r</em> permission is required to list the contents of the directory, the <em>w</em> permission is required to create
or delete files or directories, and the <em>x</em> permission is required to access a child of the directory.
</p>
<p>
In contrast to the POSIX model, there are no <em>setuid</em> or <em>setgid</em> bits for files as there is no notion of executable files.
For directories, there are no <em>setuid</em> or <em>setgid</em> bits either, as a simplification. The <em>Sticky bit</em> can be set
on directories, preventing anyone except the superuser, directory owner or file owner from deleting or moving the files within the directory.
Setting the sticky bit for a file has no effect. Collectively, the permissions of a file or directory are its <em>mode</em>. In general, Unix
customs for representing and displaying modes will be used, including the use of octal numbers in this description. When a file or directory
is created, its owner is the user identity of the client process, and its group is the group of the parent directory (the BSD rule).
</p>
<p>
Each client process that accesses HDFS has a two-part identity composed of the <em>user name</em>, and <em>groups list</em>.
Whenever HDFS must do a permissions check for a file or directory <code>foo</code> accessed by a client process,
</p>
<ul>
<li>
If the user name matches the owner of <code>foo</code>, then the owner permissions are tested;
</li>
<li>
Else if the group of <code>foo</code> matches any member of the groups list, then the group permissions are tested;
</li>
<li>
Otherwise the other permissions of <code>foo</code> are tested.
</li>
</ul>
<p>
If a permissions check fails, the client operation fails.
</p>
</section>
<section><title>User Identity</title>
<p>
As of Hadoop 0.22, Hadoop supports two different modes of operation to determine the user's identity, specified by the
<code>hadoop.security.authentication</code> property:
</p>
<dl>
<dt><code>simple</code></dt>
<dd>In this mode of operation, the identity of a client process is determined by the host operating system. On Unix-like systems,
the user name is the equivalent of <code>`whoami`</code>.</dd>
<dt><code>kerberos</code></dt>
<dd>In Kerberized operation, the identity of a client process is determined by its Kerberos credentials. For example, in a
Kerberized environment, a user may use the <code>kinit</code> utility to obtain a Kerberos ticket-granting-ticket (TGT) and
use <code>klist</code> to determine their current principal. When mapping a Kerberos principal to an HDFS username, all <em>components</em> except for the <em>primary</em> are dropped. For example, a principal <code>todd/foobar@CORP.COMPANY.COM</code> will act as the simple username <code>todd</code> on HDFS.
</dd>
</dl>
<p>
Regardless of the mode of operation, the user identity mechanism is extrinsic to HDFS itself.
There is no provision within HDFS for creating user identities, establishing groups, or processing user credentials.
</p>
</section>
<section><title>Group Mapping</title>
<p>
Once a username has been determined as described above, the list of groups is
determined by a <em>group mapping service</em>, configured by the
<code>hadoop.security.group.mapping</code> property. Refer to the
core-default.xml for details of the <code>hadoop.security.group.mapping</code>
implementation.
</p>
<p>
An alternate implementation, which connects directly to an LDAP server to resolve the list of groups, is available
via <code>org.apache.hadoop.security.LdapGroupsMapping</code>. However, this provider should only be used if the
required groups reside exclusively in LDAP, and are not materialized on the Unix servers. More information on
configuring the group mapping service is available in the Javadocs.
</p>
<p>
For HDFS, the mapping of users to groups is performed on the NameNode. Thus, the host system configuration of
the NameNode determines the group mappings for the users.
</p>
<p>
Note that HDFS stores the user and group of a file or directory as strings; there is no conversion from user and
group identity numbers as is conventional in Unix.
</p>
</section>
<section> <title>Understanding the Implementation</title>
<p>
Each file or directory operation passes the full path name to the name node, and the permissions checks are applied along the
path for each operation. The client framework will implicitly associate the user identity with the connection to the name node,
reducing the need for changes to the existing client API. It has always been the case that when one operation on a file succeeds,
the operation might fail when repeated because the file, or some directory on the path, no longer exists. For instance, when the
client first begins reading a file, it makes a first request to the name node to discover the location of the first blocks of the file.
A second request made to find additional blocks may fail. On the other hand, deleting a file does not revoke access by a client
that already knows the blocks of the file. With the addition of permissions, a client's access to a file may be withdrawn between
requests. Again, changing permissions does not revoke the access of a client that already knows the file's blocks.
</p>
</section>
<section> <title>Changes to the File System API</title>
<p>
All methods that use a path parameter will throw <code>AccessControlException</code> if permission checking fails.
</p>
<p>New methods:</p>
<ul>
<li>
<code>public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, short
replication, long blockSize, Progressable progress) throws IOException;</code>
</li>
<li>
<code>public boolean mkdirs(Path f, FsPermission permission) throws IOException;</code>
</li>
<li>
<code>public void setPermission(Path p, FsPermission permission) throws IOException;</code>
</li>
<li>
<code>public void setOwner(Path p, String username, String groupname) throws IOException;</code>
</li>
<li>
<code>public FileStatus getFileStatus(Path f) throws IOException;</code> will additionally return the user,
group and mode associated with the path.
</li>
</ul>
<p>
The mode of a new file or directory is restricted by the <code>umask</code> set as a configuration parameter.
When the existing <code>create(path, &hellip;)</code> method (<em>without</em> the permission parameter)
is used, the mode of the new file is <code>666&thinsp;&amp;&thinsp;^umask</code>. When the
new <code>create(path, </code><em>permission</em><code>, &hellip;)</code> method
(<em>with</em> the permission parameter <em>P</em>) is used, the mode of the new file is
<code>P&thinsp;&amp;&thinsp;^umask&thinsp;&amp;&thinsp;666</code>. When a new directory is
created with the existing <code>mkdirs(path)</code> method (<em>without</em> the permission parameter),
the mode of the new directory is <code>777&thinsp;&amp;&thinsp;^umask</code>. When the
new <code>mkdirs(path, </code><em>permission</em> <code>)</code> method (<em>with</em> the
permission parameter <em>P</em>) is used, the mode of the new directory is
<code>P&thinsp;&amp;&thinsp;^umask&thinsp;&amp;&thinsp;777</code>.
</p>
</section>
<section> <title>Changes to the Application Shell</title>
<p>New operations:</p>
<ul>
<li><code>chmod [-R]</code> <em>mode file &hellip;</em>
<br />Only the owner of a file or the super-user is permitted to change the mode of a file.
</li>
<li><code>chgrp [-R]</code> <em>group file &hellip;</em>
<br />The user invoking <code>chgrp</code> must belong to the specified group and be the owner of the file, or be the super-user.
</li>
<li><code>chown [-R]</code> <em>[owner][:[group]] file &hellip;</em>
<br />The owner of a file may only be altered by a super-user.
</li>
<li><code>ls </code> <em>file &hellip;</em>
</li>
<li><code>lsr </code> <em>file &hellip;</em>
<br />The output is reformatted to display the owner, group and mode.
</li>
</ul>
</section>
<section> <title>The Super-User</title>
<p>
The super-user is the user with the same identity as the name node process itself. Loosely, if you started the name
node, then you are the super-user. The super-user can do anything in that permissions checks never fail for the
super-user. There is no persistent notion of who <em>was</em> the super-user; when the name node is started
the process identity determines who is the super-user <em>for now</em>. The HDFS super-user does not have
to be the super-user of the name node host, nor is it necessary that all clusters have the same super-user. Also,
an experimenter running HDFS on a personal workstation, conveniently becomes that installation's super-user
without any configuration.
</p>
<p>
In addition, the administrator may identify a distinguished group using a configuration parameter. If set, members
of this group are also super-users.
</p>
</section>
<section> <title>The Web Server</title>
<p>
By default, the identity of the web server is a configuration parameter. That is, the name node has no notion of the identity of
the <em>real</em> user, but the web server behaves as if it has the identity (user and groups) of a user chosen
by the administrator. Unless the chosen identity matches the super-user, parts of the name space may be inaccessible
to the web server.</p>
</section>
<section> <title>Configuration Parameters</title>
<ul>
<li><code>dfs.permissions = true </code>
<br />If <code>true</code>, use the permissions system as described here. If <code>false</code>, permission
<em>checking</em> is turned off, but all other behavior is unchanged. Switching from one parameter
value to the other does not change the mode, owner or group of files or directories.
<br />Regardless of whether permissions are on or off, <code>chmod</code>, <code>chgrp</code> and
<code>chown</code> <em>always</em> check permissions. These functions are only useful in the
permissions context, and so there is no backwards compatibility issue. Furthermore, this allows
administrators to reliably set owners and permissions in advance of turning on regular permissions checking.
</li>
<li><code>dfs.web.ugi = webuser,webgroup</code>
<br />The user name to be used by the web server. Setting this to the name of the super-user allows any
web client to see everything. Changing this to an otherwise unused identity allows web clients to see
only those things visible using "other" permissions. Additional groups may be added to the comma-separated list.
</li>
<li><code>dfs.permissions.superusergroup = supergroup</code>
<br />The name of the group of super-users.
</li>
<li><code>fs.permissions.umask-mode = 022</code>
<br />The <code>umask</code> used when creating files and directories. For configuration files, the decimal
value <em>18<sub>10</sub></em> may be used.
</li>
<li><code>dfs.cluster.administrators = ACL-for-admins</code>
<br />The administrators for the cluster specified as an ACL. This
controls who can access the default servlets, etc. in the
HDFS.
</li>
</ul>
</section>
</body>
</document>

View File

@ -1,113 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header> <title>Quotas Guide</title> </header>
<body>
<section> <title>Overview</title>
<p> The Hadoop Distributed File System (HDFS) allows the <strong>administrator</strong> to set quotas for the number of names used and the
amount of space used for individual directories. Name quotas and space quotas operate independently, but the administration and
implementation of the two types of quotas are closely parallel. </p>
</section>
<section> <title>Name Quotas</title>
<p> The name quota is a hard limit on the number of file and directory names in the tree rooted at that directory. File and
directory creations fail if the quota would be exceeded. Quotas stick with renamed directories; the rename operation fails if
the operation would result in a quota violation. The attempt to set a quota will still succeed even if the directory would be in violation of the new
quota. A newly created directory has no associated quota. The largest quota is <code>Long.MAX_VALUE</code>. A quota of one
forces a directory to remain empty. (Yes, a directory counts against its own quota!) </p>
<p> Quotas are persistent with the <code>fsimage</code>. When starting, if the <code>fsimage</code> is immediately in
violation of a quota (perhaps the <code>fsimage</code> was surreptitiously modified),
a warning is printed for each of such violations. Setting or removing a quota creates a journal entry. </p> </section>
<section> <title>Space Quotas</title>
<p> The space quota is a hard limit on the number of bytes used by files in the tree rooted at that directory. Block
allocations fail if the quota would not allow a full block to be written. Each replica of a block counts against the quota. Quotas
stick with renamed directories; the rename operation fails if the operation would result in a quota violation. A newly created directory has no associated quota.
The largest quota is <code>Long.MAX_VALUE</code>. A quota of zero still permits files to be created, but no blocks can be added to the files.
Directories don't use host file system space and don't count against the space quota. The host file system space used to save
the file meta data is not counted against the quota. Quotas are charged at the intended replication factor for the file;
changing the replication factor for a file will credit or debit quotas. </p>
<p> Quotas are persistent with the <code>fsimage</code>. When starting, if the <code>fsimage</code> is immediately in
violation of a quota (perhaps the <code>fsimage</code> was surreptitiously modified), a warning is printed for
each of such violations. Setting or removing a quota creates a journal entry. </p>
</section>
<section>
<title>Administrative Commands</title>
<p> Quotas are managed by a set of commands available only to the administrator. </p>
<ul>
<li> <code>dfsadmin -setQuota &lt;N> &lt;directory>...&lt;directory></code> <br /> Set the name quota to be <code>N</code> for
each directory. Best effort for each directory, with faults reported if <code>N</code> is not a positive long integer, the
directory does not exist or it is a file, or the directory would immediately exceed the new quota. </li>
<li> <code>dfsadmin -clrQuota &lt;directory>...&lt;directory></code><br /> Remove any name quota for each directory. Best
effort for each directory, with faults reported if the directory does not exist or it is a file. It is not a fault if the
directory has no quota. </li>
<li> <code>dfsadmin -setSpaceQuota &lt;N> &lt;directory>...&lt;directory></code> <br /> Set the space quota to be
N bytes for each directory. This is a hard limit on total size of all the files under the directory tree.
The space quota takes replication also into account, i.e. one GB of data with replication of 3 consumes 3GB of quota. N can also be specified with a binary prefix for convenience, e.g. 50g for 50 gigabytes and
2t for 2 terabytes etc. Best effort for each directory, with faults reported if <code>N</code> is
neither zero nor a positive integer, the directory does not exist or it is a file, or the directory would immediately exceed
the new quota. </li>
<li> <code>dfsadmin -clrSpaceQuota &lt;directory>...&lt;directory></code><br /> Remove any space quota for each directory. Best
effort for each directory, with faults reported if the directory does not exist or it is a file. It is not a fault if the
directory has no quota. </li>
</ul>
</section>
<section>
<title>Reporting Command</title>
<p> An extension to the <code>count</code> command of the HDFS shell reports quota values and the current count of names and bytes in use. </p>
<ul>
<li>
<code>fs -count -q &lt;directory>...&lt;directory></code><br /> With the <code>-q</code> option, also report the name quota
value set for each directory, the available name quota remaining, the space quota value set, and the available space quota
remaining. If the directory does not have a quota set, the reported values are <code>none</code> and <code>inf</code>.
</li>
</ul> </section>
</body>
</document>

View File

@ -1,681 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
"http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>
HDFS Users Guide
</title>
</header>
<body>
<section> <title>Purpose</title>
<p>
This document is a starting point for users working with
Hadoop Distributed File System (HDFS) either as a part of a Hadoop cluster
or as a stand-alone general purpose distributed file system.
While HDFS is designed to "just work" in many environments, a working
knowledge of HDFS helps greatly with configuration improvements and
diagnostics on a specific cluster.
</p>
</section>
<section> <title> Overview </title>
<p>
HDFS is the primary distributed storage used by Hadoop applications. A
HDFS cluster primarily consists of a NameNode that manages the
file system metadata and DataNodes that store the actual data. The
<a href="hdfs_design.html">HDFS Architecture Guide</a> describes HDFS in detail. This user guide primarily deals with
the interaction of users and administrators with HDFS clusters.
The <a href="images/hdfsarchitecture.gif">HDFS architecture diagram</a> depicts
basic interactions among NameNode, the DataNodes, and the clients.
Clients contact NameNode for file metadata or file modifications and perform
actual file I/O directly with the DataNodes.
</p>
<p>
The following are some of the salient features that could be of
interest to many users.
</p>
<ul>
<li>
Hadoop, including HDFS, is well suited for distributed storage
and distributed processing using commodity hardware. It is fault
tolerant, scalable, and extremely simple to expand. MapReduce,
well known for its simplicity and applicability for large set of
distributed applications, is an integral part of Hadoop.
</li>
<li>
HDFS is highly configurable with a default configuration well
suited for many installations. Most of the time, configuration
needs to be tuned only for very large clusters.
</li>
<li>
Hadoop is written in Java and is supported on all major platforms.
</li>
<li>
Hadoop supports shell-like commands to interact with HDFS directly.
</li>
<li>
The NameNode and Datanodes have built in web servers that make it
easy to check current status of the cluster.
</li>
<li>
New features and improvements are regularly implemented in HDFS.
The following is a subset of useful features in HDFS:
<ul>
<li>
File permissions and authentication.
</li>
<li>
<em>Rack awareness</em>: to take a node's physical location into
account while scheduling tasks and allocating storage.
</li>
<li>
Safemode: an administrative mode for maintenance.
</li>
<li>
<code>fsck</code>: a utility to diagnose health of the file system, to
find missing files or blocks.
</li>
<li>
<code>fetchdt</code>: a utility to fetch DelegationToken and store it
in a file on the local system.
</li>
<li>
Rebalancer: tool to balance the cluster when the data is
unevenly distributed among DataNodes.
</li>
<li>
Upgrade and rollback: after a software upgrade,
it is possible to
rollback to HDFS' state before the upgrade in case of unexpected
problems.
</li>
<li>
Secondary NameNode: performs periodic checkpoints of the
namespace and helps keep the size of file containing log of HDFS
modifications within certain limits at the NameNode.
</li>
<li>
Checkpoint node: performs periodic checkpoints of the namespace and
helps minimize the size of the log stored at the NameNode
containing changes to the HDFS.
Replaces the role previously filled by the Secondary NameNode,
though is not yet battle hardened.
The NameNode allows multiple Checkpoint nodes simultaneously,
as long as there are no Backup nodes registered with the system.
</li>
<li>
Backup node: An extension to the Checkpoint node.
In addition to checkpointing it also receives a stream of edits
from the NameNode and maintains its own in-memory copy of the namespace,
which is always in sync with the active NameNode namespace state.
Only one Backup node may be registered with the NameNode at once.
</li>
</ul>
</li>
</ul>
</section> <section> <title> Prerequisites </title>
<p>
The following documents describe how to install and set up a Hadoop cluster:
</p>
<ul>
<li>
<a href="http://hadoop.apache.org/common/docs/current/single_node_setup.html">Single Node Setup</a>
for first-time users.
</li>
<li>
<a href="http://hadoop.apache.org/common/docs/current/cluster_setup.html">Cluster Setup</a>
for large, distributed clusters.
</li>
</ul>
<p>
The rest of this document assumes the user is able to set up and run a
HDFS with at least one DataNode. For the purpose of this document,
both the NameNode and DataNode could be running on the same physical
machine.
</p>
</section> <section> <title> Web Interface </title>
<p>
NameNode and DataNode each run an internal web server in order to
display basic information about the current status of the cluster.
With the default configuration, the NameNode front page is at
<code>http://namenode-name:50070/</code>.
It lists the DataNodes in the cluster and basic statistics of the
cluster. The web interface can also be used to browse the file
system (using "Browse the file system" link on the NameNode front
page).
</p>
</section> <section> <title>Shell Commands</title>
<p>
Hadoop includes various shell-like commands that directly
interact with HDFS and other file systems that Hadoop supports.
The command
<code>bin/hdfs dfs -help</code>
lists the commands supported by Hadoop
shell. Furthermore, the command
<code>bin/hdfs dfs -help command-name</code>
displays more detailed help for a command. These commands support
most of the normal file system operations like copying files,
changing file permissions, etc. It also supports a few HDFS
specific operations like changing replication of files.
For more information see <a href="http://hadoop.apache.org/common/docs/current/file_system_shell.html">File System Shell Guide</a>.
</p>
<section> <title> DFSAdmin Command </title>
<p>
The <code>bin/hadoop dfsadmin</code>
command supports a few HDFS administration related operations.
The <code>bin/hadoop dfsadmin -help</code> command
lists all the commands currently supported. For example:
</p>
<ul>
<li>
<code>-report</code>
: reports basic statistics of HDFS. Some of this information is
also available on the NameNode front page.
</li>
<li>
<code>-safemode</code>
: though usually not required, an administrator can manually enter
or leave Safemode.
</li>
<li>
<code>-finalizeUpgrade</code>
: removes previous backup of the cluster made during last upgrade.
</li>
<li>
<code>-refreshNodes</code>
: Updates the namenode with the set of datanodes allowed to
connect to the namenode. Namenodes re-read datanode hostnames
in the file defined by dfs.hosts, dfs.hosts.exclude. Hosts defined
in dfs.hosts are the datanodes that are part of the cluster.
If there are entries in dfs.hosts, only the hosts in it are
allowed to register with the namenode. Entries in dfs.hosts.exclude
are datanodes that need to be decommissioned. Datanodes complete
decommissioning when all the replicas from them are replicated
to other datanodes. Decommissioned nodes are not automatically
shutdown and are not chosen for writing for new replicas.
</li>
<li>
<code>-printTopology</code>
: Print the topology of the cluster. Display a tree of racks and
datanodes attached to the racks as viewed by the NameNode.
</li>
</ul>
<p>
For command usage, see
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#dfsadmin">dfsadmin</a>.
</p>
</section>
</section>
<section> <title>Secondary NameNode</title>
<p>
The NameNode stores modifications to the file system as a log
appended to a native file system file, <code>edits</code>.
When a NameNode starts up, it reads HDFS state from an image
file, <code>fsimage</code>, and then applies edits from the
edits log file. It then writes new HDFS state to the <code>fsimage</code>
and starts normal
operation with an empty edits file. Since NameNode merges
<code>fsimage</code> and <code>edits</code> files only during start up,
the edits log file could get very large over time on a busy cluster.
Another side effect of a larger edits file is that next
restart of NameNode takes longer.
</p>
<p>
The secondary NameNode merges the fsimage and the edits log files periodically
and keeps edits log size within a limit. It is usually run on a
different machine than the primary NameNode since its memory requirements
are on the same order as the primary NameNode.
</p>
<p>
The start of the checkpoint process on the secondary NameNode is
controlled by two configuration parameters.
</p>
<ul>
<li>
<code>dfs.namenode.checkpoint.period</code>, set to 1 hour by default, specifies
the maximum delay between two consecutive checkpoints, and
</li>
<li>
<code>dfs.namenode.checkpoint.txns</code>, set to 40000 by default, defines the
number of uncheckpointed transactions on the NameNode which will force
an urgent checkpoint, even if the checkpoint period has not been reached.
</li>
</ul>
<p>
The secondary NameNode stores the latest checkpoint in a
directory which is structured the same way as the primary NameNode's
directory, so that the checkpointed image is always ready to be
read by the primary NameNode if necessary.
</p>
<p>
For command usage, see
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#secondarynamenode">secondarynamenode</a>.
</p>
</section>
<section> <title> Checkpoint Node </title>
<p>NameNode persists its namespace using two files: <code>fsimage</code>,
which is the latest checkpoint of the namespace and <code>edits</code>,
a journal (log) of changes to the namespace since the checkpoint.
When a NameNode starts up, it merges the <code>fsimage</code> and
<code>edits</code> journal to provide an up-to-date view of the
file system metadata.
The NameNode then overwrites <code>fsimage</code> with the new HDFS state
and begins a new <code>edits</code> journal.
</p>
<p>
The Checkpoint node periodically creates checkpoints of the namespace.
It downloads <code>fsimage</code> and <code>edits</code> from the active
NameNode, merges them locally, and uploads the new image back to the
active NameNode.
The Checkpoint node usually runs on a different machine than the NameNode
since its memory requirements are on the same order as the NameNode.
The Checkpoint node is started by
<code>bin/hdfs namenode -checkpoint</code> on the node
specified in the configuration file.
</p>
<p>The location of the Checkpoint (or Backup) node and its accompanying
web interface are configured via the <code>dfs.namenode.backup.address</code>
and <code>dfs.namenode.backup.http-address</code> configuration variables.
</p>
<p>
The start of the checkpoint process on the Checkpoint node is
controlled by two configuration parameters.
</p>
<ul>
<li>
<code>dfs.namenode.checkpoint.period</code>, set to 1 hour by default, specifies
the maximum delay between two consecutive checkpoints
</li>
<li>
<code>dfs.namenode.checkpoint.txns</code>, set to 40000 by default, defines the
number of uncheckpointed transactions on the NameNode which will force
an urgent checkpoint, even if the checkpoint period has not been reached.
</li>
</ul>
<p>
The Checkpoint node stores the latest checkpoint in a
directory that is structured the same as the NameNode's
directory. This allows the checkpointed image to be always available for
reading by the NameNode if necessary.
See <a href="hdfs_user_guide.html#Import+checkpoint">Import checkpoint</a>.
</p>
<p>Multiple checkpoint nodes may be specified in the cluster configuration file.</p>
<p>
For command usage, see
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#namenode">namenode</a>.
</p>
</section>
<section> <title> Backup Node </title>
<p>
The Backup node provides the same checkpointing functionality as the
Checkpoint node, as well as maintaining an in-memory, up-to-date copy of the
file system namespace that is always synchronized with the active NameNode state.
Along with accepting a journal stream of file system edits from
the NameNode and persisting this to disk, the Backup node also applies
those edits into its own copy of the namespace in memory, thus creating
a backup of the namespace.
</p>
<p>
The Backup node does not need to download
<code>fsimage</code> and <code>edits</code> files from the active NameNode
in order to create a checkpoint, as would be required with a
Checkpoint node or Secondary NameNode, since it already has an up-to-date
copy of the namespace state in memory.
The Backup node checkpoint process is more efficient as it only needs to
save the namespace into the local <code>fsimage</code> file and reset
<code>edits</code>.
</p>
<p>
As the Backup node maintains a copy of the
namespace in memory, its RAM requirements are the same as the NameNode.
</p>
<p>
The NameNode supports one Backup node at a time. No Checkpoint nodes may be
registered if a Backup node is in use. Using multiple Backup nodes
concurrently will be supported in the future.
</p>
<p>
The Backup node is configured in the same manner as the Checkpoint node.
It is started with <code>bin/hdfs namenode -backup</code>.
</p>
<p>The location of the Backup (or Checkpoint) node and its accompanying
web interface are configured via the <code>dfs.namenode.backup.address</code>
and <code>dfs.namenode.backup.http-address</code> configuration variables.
</p>
<p>
Use of a Backup node provides the option of running the NameNode with no
persistent storage, delegating all responsibility for persisting the state
of the namespace to the Backup node.
To do this, start the NameNode with the
<code>-importCheckpoint</code> option, along with specifying no persistent
storage directories of type edits <code>dfs.namenode.edits.dir</code>
for the NameNode configuration.
</p>
<p>
For a complete discussion of the motivation behind the creation of the
Backup node and Checkpoint node, see
<a href="https://issues.apache.org/jira/browse/HADOOP-4539">HADOOP-4539</a>.
For command usage, see
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#namenode">namenode</a>.
</p>
</section>
<section> <title> Import Checkpoint </title>
<p>
The latest checkpoint can be imported to the NameNode if
all other copies of the image and the edits files are lost.
In order to do that one should:
</p>
<ul>
<li>
Create an empty directory specified in the
<code>dfs.namenode.name.dir</code> configuration variable;
</li>
<li>
Specify the location of the checkpoint directory in the
configuration variable <code>dfs.namenode.checkpoint.dir</code>;
</li>
<li>
and start the NameNode with <code>-importCheckpoint</code> option.
</li>
</ul>
<p>
The NameNode will upload the checkpoint from the
<code>dfs.namenode.checkpoint.dir</code> directory and then save it to the NameNode
directory(s) set in <code>dfs.namenode.name.dir</code>.
The NameNode will fail if a legal image is contained in
<code>dfs.namenode.name.dir</code>.
The NameNode verifies that the image in <code>dfs.namenode.checkpoint.dir</code> is
consistent, but does not modify it in any way.
</p>
<p>
For command usage, see
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#namenode">namenode</a>.
</p>
</section>
<section> <title> Rebalancer </title>
<p>
HDFS data might not always be placed uniformly across the
DataNode. One common reason is addition of new DataNodes to an
existing cluster. While placing new blocks (data for a file is
stored as a series of blocks), NameNode considers various
parameters before choosing the DataNodes to receive these blocks.
Some of the considerations are:
</p>
<ul>
<li>
Policy to keep one of the replicas of a block on the same node
as the node that is writing the block.
</li>
<li>
Need to spread different replicas of a block across the racks so
that cluster can survive loss of whole rack.
</li>
<li>
One of the replicas is usually placed on the same rack as the
node writing to the file so that cross-rack network I/O is
reduced.
</li>
<li>
Spread HDFS data uniformly across the DataNodes in the cluster.
</li>
</ul>
<p>
Due to multiple competing considerations, data might not be
uniformly placed across the DataNodes.
HDFS provides a tool for administrators that analyzes block
placement and rebalances data across the DataNode. A brief
administrator's guide for rebalancer as a
<a href="http://issues.apache.org/jira/secure/attachment/12368261/RebalanceDesign6.pdf">PDF</a>
is attached to
<a href="http://issues.apache.org/jira/browse/HADOOP-1652">HADOOP-1652</a>.
</p>
<p>
For command usage, see
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#balancer">balancer</a>.
</p>
</section> <section> <title> Rack Awareness </title>
<p>
Typically large Hadoop clusters are arranged in racks and
network traffic between different nodes within the same rack is
much more desirable than network traffic across the racks. In
addition NameNode tries to place replicas of block on
multiple racks for improved fault tolerance. Hadoop lets the
cluster administrators decide which rack a node belongs to
through configuration variable <code>net.topology.script.file.name</code>. When this
script is configured, each node runs the script to determine its
rack id. A default installation assumes all the nodes belong to
the same rack. This feature and configuration is further described
in <a href="http://issues.apache.org/jira/secure/attachment/12345251/Rack_aware_HDFS_proposal.pdf">PDF</a>
attached to
<a href="http://issues.apache.org/jira/browse/HADOOP-692">HADOOP-692</a>.
</p>
</section> <section> <title> Safemode </title>
<p>
During start up the NameNode loads the file system state from the
fsimage and the edits log file. It then waits for DataNodes
to report their blocks so that it does not prematurely start
replicating the blocks though enough replicas already exist in the
cluster. During this time NameNode stays in Safemode.
Safemode
for the NameNode is essentially a read-only mode for the HDFS cluster,
where it does not allow any modifications to file system or blocks.
Normally the NameNode leaves Safemode automatically after the DataNodes
have reported that most file system blocks are available.
If required, HDFS could be placed in Safemode explicitly
using <code>'bin/hadoop dfsadmin -safemode'</code> command. NameNode front
page shows whether Safemode is on or off. A more detailed
description and configuration is maintained as JavaDoc for
<a href="http://hadoop.apache.org/core/docs/current/api/org/apache/hadoop/dfs/NameNode.html#setSafeMode(org.apache.hadoop.dfs.HdfsConstants.SafeModeAction)"><code>setSafeMode()</code></a>.
</p>
</section> <section> <title> fsck </title>
<p>
HDFS supports the <code>fsck</code> command to check for various
inconsistencies.
It is designed for reporting problems with various
files, for example, missing blocks for a file or under-replicated
blocks. Unlike a traditional <code>fsck</code> utility for native file systems,
this command does not correct the errors it detects. Normally NameNode
automatically corrects most of the recoverable failures. By default
<code>fsck</code> ignores open files but provides an option to select all files during reporting.
The HDFS <code>fsck</code> command is not a
Hadoop shell command. It can be run as '<code>bin/hadoop fsck</code>'.
For command usage, see
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#fsck">fsck</a>.
<code>fsck</code> can be run on the whole file system or on a subset of files.
</p>
</section> <section> <title> fetchdt </title>
<p>
HDFS supports the <code>fetchdt</code> command to fetch Delegation Token
and store it in a file on the local system. This token can be later used to
access secure server (NameNode for example) from a non secure client.
Utility uses either RPC or HTTPS (over Kerberos) to get the token, and thus
requires kerberos tickets to be present before the run (run kinit to get
the tickets).
The HDFS <code>fetchdt</code> command is not a
Hadoop shell command. It can be run as '<code>bin/hadoop fetchdt DTfile </code>'.
After you got the token you can run an HDFS command without having Kerberos
tickets, by pointing HADOOP_TOKEN_FILE_LOCATION environmental variable to
the delegation token file.
For command usage, see <a href="http://hadoop.apache.org/common/docs/current/commands_manual.html#fetchdt"><code>fetchdt</code> command</a>.
</p>
</section>
<section> <title>Recovery Mode</title>
<p>Typically, you will configure multiple metadata storage locations.
Then, if one storage location is corrupt, you can read the
metadata from one of the other storage locations.</p>
<p>However, what can you do if the only storage locations available are
corrupt? In this case, there is a special NameNode startup mode called
Recovery mode that may allow you to recover most of your data.</p>
<p>You can start the NameNode in recovery mode like so:
<code>namenode -recover</code></p>
<p>When in recovery mode, the NameNode will interactively prompt you at
the command line about possible courses of action you can take to
recover your data.</p>
<p>If you don't want to be prompted, you can give the
<code>-force</code> option. This option will force
recovery mode to always select the first choice. Normally, this
will be the most reasonable choice.</p>
<p>Because Recovery mode can cause you to lose data, you should always
back up your edit log and fsimage before using it.</p>
</section>
<section> <title> Upgrade and Rollback </title>
<p>
When Hadoop is upgraded on an existing cluster, as with any
software upgrade, it is possible there are new bugs or
incompatible changes that affect existing applications and were
not discovered earlier. In any non-trivial HDFS installation, it
is not an option to lose any data, let alone to restart HDFS from
scratch. HDFS allows administrators to go back to earlier version
of Hadoop and rollback the cluster to the state it was in
before
the upgrade. HDFS upgrade is described in more detail in
<a href="http://wiki.apache.org/hadoop/Hadoop_Upgrade">Hadoop Upgrade</a> Wiki page.
HDFS can have one such backup at a time. Before upgrading,
administrators need to remove existing backup using <code>bin/hadoop
dfsadmin -finalizeUpgrade</code> command. The following
briefly describes the typical upgrade procedure:
</p>
<ul>
<li>
Before upgrading Hadoop software,
<em>finalize</em> if there is an existing backup.
<code>dfsadmin -upgradeProgress status</code>
can tell if the cluster needs to be <em>finalized</em>.
</li>
<li>Stop the cluster and distribute new version of Hadoop.</li>
<li>
Run the new version with <code>-upgrade</code> option
(<code>bin/start-dfs.sh -upgrade</code>).
</li>
<li>
Most of the time, cluster works just fine. Once the new HDFS is
considered working well (maybe after a few days of operation),
finalize the upgrade. Note that until the cluster is finalized,
deleting the files that existed before the upgrade does not free
up real disk space on the DataNodes.
</li>
<li>
If there is a need to move back to the old version,
<ul>
<li> stop the cluster and distribute earlier version of Hadoop. </li>
<li> start the cluster with rollback option.
(<code>bin/start-dfs.sh -rollback</code>).
</li>
</ul>
</li>
</ul>
</section> <section> <title> File Permissions and Security </title>
<p>
The file permissions are designed to be similar to file permissions on
other familiar platforms like Linux. Currently, security is limited
to simple file permissions. The user that starts NameNode is
treated as the superuser for HDFS. Future versions of HDFS will
support network authentication protocols like Kerberos for user
authentication and encryption of data transfers. The details are discussed in the
<a href="hdfs_permissions_guide.html">Permissions Guide</a>.
</p>
</section> <section> <title> Scalability </title>
<p>
Hadoop currently runs on clusters with thousands of nodes. The
<a href="http://wiki.apache.org/hadoop/PoweredBy">PoweredBy</a> Wiki page
lists some of the organizations that deploy Hadoop on large
clusters. HDFS has one NameNode for each cluster. Currently
the total memory available on NameNode is the primary scalability
limitation. On very large clusters, increasing average size of
files stored in HDFS helps with increasing cluster size without
increasing memory requirements on NameNode.
The default configuration may not suit very large clusters. The
<a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a> Wiki page lists
suggested configuration improvements for large Hadoop clusters.
</p>
</section> <section> <title> Related Documentation </title>
<p>
This user guide is a good starting point for
working with HDFS. While the user guide continues to improve,
there is a large wealth of documentation about Hadoop and HDFS.
The following list is a starting point for further exploration:
</p>
<ul>
<li>
<a href="http://hadoop.apache.org/">Hadoop Site</a>: The home page for the Apache Hadoop site.
</li>
<li>
<a href="http://wiki.apache.org/hadoop/FrontPage">Hadoop Wiki</a>:
The home page (FrontPage) for the Hadoop Wiki. Unlike the released documentation,
which is part of Hadoop source tree, Hadoop Wiki is
regularly edited by Hadoop Community.
</li>
<li> <a href="http://wiki.apache.org/hadoop/FAQ">FAQ</a>:
The FAQ Wiki page.
</li>
<li>
Hadoop <a href="http://hadoop.apache.org/core/docs/current/api/">
JavaDoc API</a>.
</li>
<li>
Hadoop User Mailing List :
<a href="mailto:core-user@hadoop.apache.org">core-user[at]hadoop.apache.org</a>.
</li>
<li>
Explore <code>src/hdfs/hdfs-default.xml</code>.
It includes brief
description of most of the configuration variables available.
</li>
<li>
<a href="http://hadoop.apache.org/common/docs/current/commands_manual.html">Hadoop Commands Guide</a>: Hadoop commands usage.
</li>
</ul>
</section>
</body>
</document>

View File

@ -1,69 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title> HFTP Guide</title>
</header>
<body>
<section>
<title> Introduction </title>
<p> HFTP is a Hadoop filesystem implementation that lets you read data from a remote Hadoop HDFS cluster.
The reads are done via HTTP, and data is sourced from DataNodes.
HFTP is a read-only filesystem, and will throw exceptions if you try to use it to write data or modify
the filesystem state.</p>
<p>HFTP is primarily useful if you have multiple HDFS clusters with different versions and you need to move data from one to another. HFTP is wire-compatible even between different versions of HDFS. For example, you can do things like:
<code>hadoop distcp -i hftp://sourceFS:50070/src hdfs://destFS:50070/dest</code>. Note that HFTP is read-only so the destination must be an HDFS filesystem. (Also, in this example, the <code>distcp</code> should be run using the configuration of the new filesystem.)</p>
<p>An extension, HSFTP, uses HTTPS by default. This means that data will be encrypted in transit.</p>
</section>
<section>
<title>Implementation</title>
<p>The code for HFTP lives in the Java class <code>org.apache.hadoop.hdfs.HftpFileSystem</code>. Likewise,
HSFTP is implemented in <code>org.apache.hadoop.hdfs.HsftpFileSystem</code>.
</p>
</section>
<section>
<title> Configuration Options </title>
<table>
<tr>
<th>Name</th>
<th>Description</th>
</tr>
<tr>
<td>dfs.hftp.https.port</td>
<td>the HTTPS port on the remote cluster. If not set, HFTP will fall back on
<code>dfs.https.port</code>.</td>
</tr>
<tr>
<td>hdfs.service.host_<strong>ip:port</strong></td>
<td>Specifies the service name (for the security subsystem) associated with the HFTP filesystem
running at <strong>ip:port.</strong></td>
</tr>
</table>
</section>
</body>
</document>

View File

@ -1,110 +0,0 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
"http://forrest.apache.org/dtd/document-v20.dtd">
<document>
<header>
<title>C API libhdfs</title>
<meta name="http-equiv">Content-Type</meta>
<meta name="content">text/html;</meta>
<meta name="charset">utf-8</meta>
</header>
<body>
<section>
<title>Overview</title>
<p>
libhdfs is a JNI based C API for Hadoop's Distributed File System (HDFS).
It provides C APIs to a subset of the HDFS APIs to manipulate HDFS files and
the filesystem. libhdfs is part of the Hadoop distribution and comes
pre-compiled in ${HADOOP_PREFIX}/libhdfs/libhdfs.so .
</p>
</section>
<section>
<title>The APIs</title>
<p>
The libhdfs APIs are a subset of: <a href="api/org/apache/hadoop/fs/FileSystem.html" >hadoop fs APIs</a>.
</p>
<p>
The header file for libhdfs describes each API in detail and is available in ${HADOOP_PREFIX}/src/c++/libhdfs/hdfs.h
</p>
</section>
<section>
<title>A Sample Program</title>
<source>
#include "hdfs.h"
int main(int argc, char **argv) {
hdfsFS fs = hdfsConnect("default", 0);
const char* writePath = "/tmp/testfile.txt";
hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0);
if(!writeFile) {
fprintf(stderr, "Failed to open %s for writing!\n", writePath);
exit(-1);
}
char* buffer = "Hello, World!";
tSize num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer)+1);
if (hdfsFlush(fs, writeFile)) {
fprintf(stderr, "Failed to 'flush' %s\n", writePath);
exit(-1);
}
hdfsCloseFile(fs, writeFile);
}
</source>
</section>
<section>
<title>How To Link With The Library</title>
<p>
See the Makefile for hdfs_test.c in the libhdfs source directory (${HADOOP_PREFIX}/src/c++/libhdfs/Makefile) or something like:<br />
gcc above_sample.c -I${HADOOP_PREFIX}/src/c++/libhdfs -L${HADOOP_PREFIX}/libhdfs -lhdfs -o above_sample
</p>
</section>
<section>
<title>Common Problems</title>
<p>
The most common problem is the CLASSPATH is not set properly when calling a program that uses libhdfs.
Make sure you set it to all the Hadoop jars needed to run Hadoop itself. Currently, there is no way to
programmatically generate the classpath, but a good bet is to include all the jar files in ${HADOOP_PREFIX}
and ${HADOOP_PREFIX}/lib as well as the right configuration directory containing hdfs-site.xml
</p>
</section>
<section>
<title>Thread Safe</title>
<p>libhdfs is thread safe.</p>
<ul>
<li>Concurrency and Hadoop FS "handles"
<br />The Hadoop FS implementation includes a FS handle cache which caches based on the URI of the
namenode along with the user connecting. So, all calls to hdfsConnect will return the same handle but
calls to hdfsConnectAsUser with different users will return different handles. But, since HDFS client
handles are completely thread safe, this has no bearing on concurrency.
</li>
<li>Concurrency and libhdfs/JNI
<br />The libhdfs calls to JNI should always be creating thread local storage, so (in theory), libhdfs
should be as thread safe as the underlying calls to the Hadoop FS.
</li>
</ul>
</section>
</body>
</document>

View File

@ -311,10 +311,9 @@ public class DistributedFileSystem extends FileSystem {
}
/**
* THIS IS DFS only operations, it is not part of FileSystem
* move blocks from srcs to trg
* Move blocks from srcs to trg
* and delete srcs afterwards
* all blocks should be the same size
* RESTRICTION: all blocks should be the same size
* @param trg existing file to append to
* @param psrcs list of files (same block size, same replication)
* @throws IOException

View File

@ -1333,8 +1333,9 @@ public class Balancer {
// Exit status
enum ReturnStatus {
SUCCESS(1),
IN_PROGRESS(0),
// These int values will map directly to the balancer process's exit code.
SUCCESS(0),
IN_PROGRESS(1),
ALREADY_RUNNING(-1),
NO_MOVE_BLOCK(-2),
NO_MOVE_PROGRESS(-3),
@ -1507,7 +1508,12 @@ public class Balancer {
}
static class Cli extends Configured implements Tool {
/** Parse arguments and then run Balancer */
/**
* Parse arguments and then run Balancer.
*
* @param args command specific arguments.
* @return exit code. 0 indicates success, non-zero indicates failure.
*/
@Override
public int run(String[] args) {
final long startTime = Time.now();

View File

@ -75,6 +75,7 @@ import org.apache.hadoop.hdfs.server.datanode.fsdataset.VolumeChoosingPolicy;
import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
@ -398,13 +399,17 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
final File dstfile = new File(destdir, b.getBlockName());
final File srcmeta = FsDatasetUtil.getMetaFile(srcfile, b.getGenerationStamp());
final File dstmeta = FsDatasetUtil.getMetaFile(dstfile, b.getGenerationStamp());
if (!srcmeta.renameTo(dstmeta)) {
try {
NativeIO.renameTo(srcmeta, dstmeta);
} catch (IOException e) {
throw new IOException("Failed to move meta file for " + b
+ " from " + srcmeta + " to " + dstmeta);
+ " from " + srcmeta + " to " + dstmeta, e);
}
if (!srcfile.renameTo(dstfile)) {
try {
NativeIO.renameTo(srcfile, dstfile);
} catch (IOException e) {
throw new IOException("Failed to move block file for " + b
+ " from " + srcfile + " to " + dstfile.getAbsolutePath());
+ " from " + srcfile + " to " + dstfile.getAbsolutePath(), e);
}
if (LOG.isDebugEnabled()) {
LOG.debug("addBlock: Moved " + srcmeta + " to " + dstmeta
@ -531,10 +536,12 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
if (LOG.isDebugEnabled()) {
LOG.debug("Renaming " + oldmeta + " to " + newmeta);
}
if (!oldmeta.renameTo(newmeta)) {
try {
NativeIO.renameTo(oldmeta, newmeta);
} catch (IOException e) {
throw new IOException("Block " + replicaInfo + " reopen failed. " +
" Unable to move meta file " + oldmeta +
" to rbw dir " + newmeta);
" to rbw dir " + newmeta, e);
}
// rename block file to rbw directory
@ -542,14 +549,18 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
LOG.debug("Renaming " + blkfile + " to " + newBlkFile
+ ", file length=" + blkfile.length());
}
if (!blkfile.renameTo(newBlkFile)) {
if (!newmeta.renameTo(oldmeta)) { // restore the meta file
try {
NativeIO.renameTo(blkfile, newBlkFile);
} catch (IOException e) {
try {
NativeIO.renameTo(newmeta, oldmeta);
} catch (IOException ex) {
LOG.warn("Cannot move meta file " + newmeta +
"back to the finalized directory " + oldmeta);
"back to the finalized directory " + oldmeta, ex);
}
throw new IOException("Block " + replicaInfo + " reopen failed. " +
" Unable to move block file " + blkfile +
" to rbw dir " + newBlkFile);
" to rbw dir " + newBlkFile, e);
}
// Replace finalized replica by a RBW replica in replicas map
@ -656,11 +667,13 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
if (LOG.isDebugEnabled()) {
LOG.debug("Renaming " + oldmeta + " to " + newmeta);
}
if (!oldmeta.renameTo(newmeta)) {
try {
NativeIO.renameTo(oldmeta, newmeta);
} catch (IOException e) {
replicaInfo.setGenerationStamp(oldGS); // restore old GS
throw new IOException("Block " + replicaInfo + " reopen failed. " +
" Unable to move meta file " + oldmeta +
" to " + newmeta);
" to " + newmeta, e);
}
}

View File

@ -70,6 +70,7 @@ import org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.hdfs.web.resources.AccessTimeParam;
import org.apache.hadoop.hdfs.web.resources.BlockSizeParam;
import org.apache.hadoop.hdfs.web.resources.BufferSizeParam;
import org.apache.hadoop.hdfs.web.resources.ConcatSourcesParam;
import org.apache.hadoop.hdfs.web.resources.CreateParentParam;
import org.apache.hadoop.hdfs.web.resources.DelegationParam;
import org.apache.hadoop.hdfs.web.resources.DeleteOpParam;
@ -483,10 +484,12 @@ public class NamenodeWebHdfsMethods {
final DoAsParam doAsUser,
@QueryParam(PostOpParam.NAME) @DefaultValue(PostOpParam.DEFAULT)
final PostOpParam op,
@QueryParam(ConcatSourcesParam.NAME) @DefaultValue(ConcatSourcesParam.DEFAULT)
final ConcatSourcesParam concatSrcs,
@QueryParam(BufferSizeParam.NAME) @DefaultValue(BufferSizeParam.DEFAULT)
final BufferSizeParam bufferSize
) throws IOException, InterruptedException {
return post(ugi, delegation, username, doAsUser, ROOT, op, bufferSize);
return post(ugi, delegation, username, doAsUser, ROOT, op, concatSrcs, bufferSize);
}
/** Handle HTTP POST request. */
@ -505,11 +508,13 @@ public class NamenodeWebHdfsMethods {
@PathParam(UriFsPathParam.NAME) final UriFsPathParam path,
@QueryParam(PostOpParam.NAME) @DefaultValue(PostOpParam.DEFAULT)
final PostOpParam op,
@QueryParam(ConcatSourcesParam.NAME) @DefaultValue(ConcatSourcesParam.DEFAULT)
final ConcatSourcesParam concatSrcs,
@QueryParam(BufferSizeParam.NAME) @DefaultValue(BufferSizeParam.DEFAULT)
final BufferSizeParam bufferSize
) throws IOException, InterruptedException {
init(ugi, delegation, username, doAsUser, path, op, bufferSize);
init(ugi, delegation, username, doAsUser, path, op, concatSrcs, bufferSize);
return ugi.doAs(new PrivilegedExceptionAction<Response>() {
@Override
@ -517,7 +522,7 @@ public class NamenodeWebHdfsMethods {
REMOTE_ADDRESS.set(request.getRemoteAddr());
try {
return post(ugi, delegation, username, doAsUser,
path.getAbsolutePath(), op, bufferSize);
path.getAbsolutePath(), op, concatSrcs, bufferSize);
} finally {
REMOTE_ADDRESS.set(null);
}
@ -532,6 +537,7 @@ public class NamenodeWebHdfsMethods {
final DoAsParam doAsUser,
final String fullpath,
final PostOpParam op,
final ConcatSourcesParam concatSrcs,
final BufferSizeParam bufferSize
) throws IOException, URISyntaxException {
final NameNode namenode = (NameNode)context.getAttribute("name.node");
@ -543,6 +549,11 @@ public class NamenodeWebHdfsMethods {
fullpath, op.getValue(), -1L, -1L, bufferSize);
return Response.temporaryRedirect(uri).type(MediaType.APPLICATION_OCTET_STREAM).build();
}
case CONCAT:
{
namenode.getRpcServer().concat(fullpath, concatSrcs.getAbsolutePaths());
return Response.ok().build();
}
default:
throw new UnsupportedOperationException(op + " is not supported");
}

View File

@ -29,7 +29,9 @@ import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
@ -65,6 +67,7 @@ import org.apache.hadoop.hdfs.server.namenode.SafeModeException;
import org.apache.hadoop.hdfs.web.resources.AccessTimeParam;
import org.apache.hadoop.hdfs.web.resources.BlockSizeParam;
import org.apache.hadoop.hdfs.web.resources.BufferSizeParam;
import org.apache.hadoop.hdfs.web.resources.ConcatSourcesParam;
import org.apache.hadoop.hdfs.web.resources.CreateParentParam;
import org.apache.hadoop.hdfs.web.resources.DeleteOpParam;
import org.apache.hadoop.hdfs.web.resources.DestinationParam;
@ -103,6 +106,7 @@ import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.security.token.TokenRenewer;
import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSelector;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.mortbay.util.ajax.JSON;
import com.google.common.base.Charsets;
@ -716,6 +720,22 @@ public class WebHdfsFileSystem extends FileSystem
};
}
/**
 * Concatenates the given source files onto the target file by issuing a
 * CONCAT POST operation.
 *
 * @param trg the target file the request is issued against
 * @param psrcs the source files; only the path component of each URI is sent
 * @throws IOException if the operation fails
 */
@Override
public void concat(final Path trg, final Path [] psrcs) throws IOException {
  statistics.incrementWriteOps(1);
  final HttpOpParam.Op concatOp = PostOpParam.Op.CONCAT;

  // Collect the path component of every source, preserving the given order.
  final List<String> sourcePaths = new ArrayList<String>(psrcs.length);
  for (int i = 0; i < psrcs.length; i++) {
    sourcePaths.add(psrcs[i].toUri().getPath());
  }

  // All sources are passed as one comma-separated parameter value.
  final ConcatSourcesParam sources =
      new ConcatSourcesParam(StringUtils.join(",", sourcePaths));
  run(concatOp, trg, sources);
}
@Override
public FSDataOutputStream create(final Path f, final FsPermission permission,
final boolean overwrite, final int bufferSize, final short replication,

View File

@ -0,0 +1,48 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.web.resources;
/**
 * The concat source paths parameter: a single string holding the
 * comma-separated source paths of a concat operation.
 */
public class ConcatSourcesParam extends StringParam {
  /** Parameter name. */
  public static final String NAME = "srcs";

  /** Default parameter value. */
  public static final String DEFAULT = NULL;

  private static final Domain DOMAIN = new Domain(NAME, null);

  /**
   * Constructor.
   * @param str a string representation of the parameter value.
   */
  public ConcatSourcesParam(String str) {
    super(DOMAIN, str);
  }

  /** @return the source paths, obtained by splitting the value on commas. */
  public final String[] getAbsolutePaths() {
    return getValue().split(",");
  }

  @Override
  public String getName() {
    return NAME;
  }
}

View File

@ -23,13 +23,17 @@ import java.net.HttpURLConnection;
public class PostOpParam extends HttpOpParam<PostOpParam.Op> {
/** Post operations. */
public static enum Op implements HttpOpParam.Op {
APPEND(HttpURLConnection.HTTP_OK),
APPEND(true, HttpURLConnection.HTTP_OK),
NULL(HttpURLConnection.HTTP_NOT_IMPLEMENTED);
CONCAT(false, HttpURLConnection.HTTP_OK),
NULL(false, HttpURLConnection.HTTP_NOT_IMPLEMENTED);
final boolean doOutputAndRedirect;
final int expectedHttpResponseCode;
Op(final int expectedHttpResponseCode) {
Op(final boolean doOutputAndRedirect, final int expectedHttpResponseCode) {
this.doOutputAndRedirect = doOutputAndRedirect;
this.expectedHttpResponseCode = expectedHttpResponseCode;
}
@ -40,12 +44,12 @@ public class PostOpParam extends HttpOpParam<PostOpParam.Op> {
@Override
public boolean getDoOutput() {
return true;
return doOutputAndRedirect;
}
@Override
public boolean getRedirect() {
return true;
return doOutputAndRedirect;
}
@Override

View File

@ -0,0 +1,312 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Fault Injection Framework and Development Guide
---
---
${maven.build.timestamp}
Fault Injection Framework and Development Guide
%{toc|section=1|fromDepth=0}
* Introduction
This guide provides an overview of the Hadoop Fault Injection (FI)
framework for those who will be developing their own faults (aspects).
The idea of fault injection is fairly simple: it is an infusion of
errors and exceptions into an application's logic to achieve a higher
coverage and fault tolerance of the system. Different implementations
of this idea are available today. Hadoop's FI framework is built on top
of Aspect Oriented Paradigm (AOP) implemented by AspectJ toolkit.
* Assumptions
The current implementation of the FI framework assumes that the faults
it will be emulating are of non-deterministic nature. That is, the
moment of a fault's happening isn't known in advance and is coin-flip
based.
* Architecture of the Fault Injection Framework
Components layout
** Configuration Management
This piece of the FI framework allows you to set expectations for
faults to happen. The settings can be applied either statically (in
advance) or in runtime. The desired level of faults in the framework
can be configured two ways:
* editing src/aop/fi-site.xml configuration file. This file is
similar to other Hadoop's config files
* setting system properties of JVM through VM startup parameters or
in build.properties file
** Probability Model
This is fundamentally a coin flipper. The methods of this class are
getting a random number between 0.0 and 1.0 and then checking if a new
number has happened in the range of 0.0 and a configured level for the
fault in question. If that condition is true then the fault will occur.
Thus, to guarantee the happening of a fault one needs to set an
appropriate level to 1.0. To completely prevent a fault from happening
its probability level has to be set to 0.0.
Note: The default probability level is set to 0 (zero) unless the level
is changed explicitly through the configuration file or in the runtime.
The name of the default level's configuration parameter is fi.*
** Fault Injection Mechanism: AOP and AspectJ
The foundation of Hadoop's FI framework includes a cross-cutting
concept implemented by AspectJ. The following basic terms are important
to remember:
* A cross-cutting concept (aspect) is behavior, and often data, that
is used across the scope of a piece of software
* In AOP, the aspects provide a mechanism by which a cross-cutting
concern can be specified in a modular way
* Advice is the code that is executed when an aspect is invoked
* Join point (or pointcut) is a specific point within the application
that may or may not invoke some advice
** Existing Join Points
The following readily available join points are provided by AspectJ:
* Join when a method is called
* Join during a method's execution
* Join when a constructor is invoked
* Join during a constructor's execution
* Join during aspect advice execution
* Join before an object is initialized
* Join during object initialization
* Join during static initializer execution
* Join when a class's field is referenced
* Join when a class's field is assigned
* Join when a handler is executed
* Aspect Example
----
package org.apache.hadoop.hdfs.server.datanode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fi.ProbabilityModel;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.util.DiskChecker.*;
import java.io.IOException;
import java.io.OutputStream;
import java.io.DataOutputStream;
/**
* This aspect injects faults into the datanode.BlockReceiver class: with a
* configured probability it throws a DiskOutOfSpaceException right before
* an OutputStream.write() call made from BlockReceiver.receivePacket().
*/
public aspect BlockReceiverAspects {
public static final Log LOG = LogFactory.getLog(BlockReceiverAspects.class);
// Name of the fault handed to ProbabilityModel.injectCriteria() below.
public static final String BLOCK_RECEIVER_FAULT="hdfs.datanode.BlockReceiver";
// Matches calls to any OutputStream.write(..) overload that are made from
// within the code of BlockReceiver.receivePacket(..), excluding calls that
// originate inside this aspect or its subtypes.
pointcut callReceivePacket() : call (* OutputStream.write(..))
&& withincode (* BlockReceiver.receivePacket(..))
// to further limit the application of this aspect a very narrow 'target' can be used as follows
// && target(DataOutputStream)
&& !within(BlockReceiverAspects +);
// Advice executed before every join point selected by callReceivePacket().
before () throws IOException : callReceivePacket () {
// Inject the fault only when the probability model says so for this fault name.
if (ProbabilityModel.injectCriteria(BLOCK_RECEIVER_FAULT)) {
LOG.info("Before the injection point");
Thread.dumpStack();
// The message carries the source location of the matched join point.
throw new DiskOutOfSpaceException ("FI: injected fault point at " +
thisJoinPoint.getStaticPart( ).getSourceLocation());
}
}
}
----
The aspect has two main parts:
* The join point pointcut callReceivePacket(), which serves as an
identification mark of a specific point (in control and/or data
flow) in the life of an application.
* A call to the advice - before () throws IOException :
callReceivePacket() - will be injected (see Putting It All
Together) before that specific spot of the application's code.
The pointcut identifies an invocation of the java.io.OutputStream class's
write() method with any number of parameters and any return type. This
invocation must take place within the body of method receivePacket() from
class BlockReceiver. The method can have any parameters and any return
type. Possible invocations of the write() method happening anywhere within
the aspect BlockReceiverAspects or its heirs will be ignored.
Note 1: This short example doesn't illustrate the fact that you can
have more than a single injection point per class. In such a case the
names of the faults have to be different if a developer wants to
trigger them separately.
Note 2: After the injection step (see Putting It All Together) you can
verify that the faults were properly injected by searching for ajc
keywords in a disassembled class file.
* Fault Naming Convention and Namespaces
For the sake of a unified naming convention the following two types of
names are recommended for the development of new aspects:
* Activity specific notation (when we don't care about a particular
location of a fault's happening). In this case the name of the
fault is rather abstract: fi.hdfs.DiskError
* Location specific notation. Here, the fault's name is mnemonic as
in: fi.hdfs.datanode.BlockReceiver[optional location details]
* Development Tools
* The Eclipse AspectJ Development Toolkit may help you when
developing aspects
* IntelliJ IDEA provides AspectJ weaver and Spring-AOP plugins
* Putting It All Together
Faults (aspects) have to be injected (or woven) together before they can
be used. Follow these instructions:
* To weave aspects in place use:
----
% ant injectfaults
----
* If you misidentified the join point of your aspect you will see a
warning (similar to the one shown here) when 'injectfaults' target
is completed:
----
[iajc] warning at
src/test/aop/org/apache/hadoop/hdfs/server/datanode/ \
BlockReceiverAspects.aj:44::0
advice defined in org.apache.hadoop.hdfs.server.datanode.BlockReceiverAspects
has not been applied [Xlint:adviceDidNotMatch]
----
* It isn't an error, so the build will report the successful result.
To prepare dev.jar file with all your faults weaved in place
(HDFS-475 pending) use:
----
% ant jar-fault-inject
----
* To create test jars use:
----
% ant jar-test-fault-inject
----
* To run HDFS tests with faults injected use:
----
% ant run-test-hdfs-fault-inject
----
** How to Use the Fault Injection Framework
Faults can be triggered as follows:
* During runtime:
----
% ant run-test-hdfs -Dfi.hdfs.datanode.BlockReceiver=0.12
----
To set a certain level, for example 25%, of all injected faults
use:
----
% ant run-test-hdfs-fault-inject -Dfi.*=0.25
----
* From a program:
----
package org.apache.hadoop.fs;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
 * Demonstrates how a test can switch a fault on and off at runtime by
 * setting and clearing the fault's "fi."-prefixed system property.
 */
public class DemoFiTest {
  /** Fault name; "fi." is prepended to it to build the system property key. */
  public static final String BLOCK_RECEIVER_FAULT="hdfs.datanode.BlockReceiver";

  // No @Override here: this class has no supertype declaring setUp().
  @Before
  public void setUp() {
    //Setting up the test's environment as required
  }

  @Test
  public void testFI() {
    // It triggers the fault, assuming that there's one called 'hdfs.datanode.BlockReceiver'
    System.setProperty("fi." + BLOCK_RECEIVER_FAULT, "0.12");
    //
    // The main logic of your tests goes here
    //
    // Now set the level back to 0 (zero) to prevent this fault from happening again
    System.setProperty("fi." + BLOCK_RECEIVER_FAULT, "0.0");
    // or delete its trigger completely
    System.getProperties().remove("fi." + BLOCK_RECEIVER_FAULT);
  }

  // No @Override here: this class has no supertype declaring tearDown().
  @After
  public void tearDown() {
    //Cleaning up the test environment
  }
}
----
As you can see above these two methods do the same thing. They are
setting the probability level of <<<hdfs.datanode.BlockReceiver>>> at 12%.
The difference, however, is that the program provides more flexibility
and allows you to turn a fault off when a test no longer needs it.
* Additional Information and Contacts
These two sources of information are particularly interesting and worth
reading:
* {{http://www.eclipse.org/aspectj/doc/next/devguide/}}
* AspectJ Cookbook (ISBN-13: 978-0-596-00654-9)
If you have additional comments or questions for the author check
{{{https://issues.apache.org/jira/browse/HDFS-435}HDFS-435}}.

View File

@ -0,0 +1,106 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Offline Edits Viewer Guide
---
Erik Steffl
---
${maven.build.timestamp}
Offline Edits Viewer Guide
\[ {{{./index.html}Go Back}} \]
%{toc|section=1|fromDepth=0}
* Overview
Offline Edits Viewer is a tool to parse the Edits log file. The current
processors are mostly useful for conversion between different formats,
including XML which is human readable and easier to edit than native
binary format.
The tool can parse the edits formats -18 (roughly Hadoop 0.19) and
later. The tool operates on files only, it does not need Hadoop cluster
to be running.
Input formats supported:
[[1]] <<binary>>: native binary format that Hadoop uses internally
[[2]] <<xml>>: XML format, as produced by xml processor, used if filename
has <<<.xml>>> (case insensitive) extension
The Offline Edits Viewer provides several output processors (unless
stated otherwise the output of the processor can be converted back to
original edits file):
[[1]] <<binary>>: native binary format that Hadoop uses internally
[[2]] <<xml>>: XML format
[[3]] <<stats>>: prints out statistics, this cannot be converted back to
Edits file
* Usage
----
bash$ bin/hdfs oev -i edits -o edits.xml
----
*-----------------------:-----------------------------------+
| Flag | Description |
*-----------------------:-----------------------------------+
|[<<<-i>>> ; <<<--inputFile>>>] <input file> | Specify the input edits log file to
| | process. Xml (case insensitive) extension means XML format otherwise
| | binary format is assumed. Required.
*-----------------------:-----------------------------------+
|[<<-o>> ; <<--outputFile>>] <output file> | Specify the output filename, if the
| | specified output processor generates one. If the specified file already
| | exists, it is silently overwritten. Required.
*-----------------------:-----------------------------------+
|[<<-p>> ; <<--processor>>] <processor> | Specify the output processor to apply
| | against the edits file. Currently valid options are
| | <<<binary>>>, <<<xml>>> (default) and <<<stats>>>.
*-----------------------:-----------------------------------+
|<<[-v ; --verbose] >> | Print the input and output filenames and pipe output of
| | processor to console as well as specified file. On extremely large
| | files, this may increase processing time by an order of magnitude.
*-----------------------:-----------------------------------+
|<<[-h ; --help] >> | Display the tool usage and help information and exit.
*-----------------------:-----------------------------------+
* Case study: Hadoop cluster recovery
In case there is some problem with the Hadoop cluster and the edits file is
corrupted, it is possible to save at least the part of the edits file that
is correct. This can be done by converting the binary edits to XML,
editing it manually and then converting it back to binary. The most common
problem is that the edits file is missing the closing record (record
that has opCode -1). This should be recognized by the tool and the XML
format should be properly closed.
If there is no closing record in the XML file you can add one after
last correct record. Anything after the record with opCode -1 is
ignored.
Example of a closing record (with opCode -1):
+----
<RECORD>
<OPCODE>-1</OPCODE>
<DATA>
</DATA>
</RECORD>
+----

View File

@ -0,0 +1,418 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Offline Image Viewer Guide
---
---
${maven.build.timestamp}
Offline Image Viewer Guide
\[ {{{./index.html}Go Back}} \]
%{toc|section=1|fromDepth=0}
* Overview
The Offline Image Viewer is a tool to dump the contents of hdfs fsimage
files to human-readable formats in order to allow offline analysis and
examination of an Hadoop cluster's namespace. The tool is able to
process very large image files relatively quickly, converting them to
one of several output formats. The tool handles the layout formats that
were included with Hadoop versions 16 and up. If the tool is not able
to process an image file, it will exit cleanly. The Offline Image
Viewer does not require an Hadoop cluster to be running; it is entirely
offline in its operation.
The Offline Image Viewer provides several output processors:
[[1]] Ls is the default output processor. It closely mimics the format of
the lsr command. It includes the same fields, in the same order, as
lsr : directory or file flag, permissions, replication, owner,
group, file size, modification date, and full path. Unlike the lsr
command, the root path is included. One important difference
between the output of the lsr command and this processor is that this
output is not sorted by directory name and contents. Rather, the
files are listed in the order in which they are stored in the
fsimage file. Therefore, it is not possible to directly compare the
output of the lsr command with that of this tool. The Ls processor uses
information contained within the Inode blocks to calculate file
sizes and ignores the -skipBlocks option.
[[2]] Indented provides a more complete view of the fsimage's contents,
including all of the information included in the image, such as
image version, generation stamp and inode- and block-specific
listings. This processor uses indentation to organize the output
in a hierarchical manner. The lsr format is suitable for easy human
comprehension.
[[3]] Delimited provides one file per line consisting of the path,
replication, modification time, access time, block size, number of
blocks, file size, namespace quota, diskspace quota, permissions,
username and group name. If run against an fsimage that does not
contain any of these fields, the field's column will be included,
but no data recorded. The default record delimiter is a tab, but
this may be changed via the -delimiter command line argument. This
processor is designed to create output that is easily analyzed by
other tools, such as Apache Pig. See the Analyzing Results
section for further information on using this processor to analyze
the contents of fsimage files.
[[4]] XML creates an XML document of the fsimage and includes all of the
information within the fsimage, similar to the lsr processor. The
output of this processor is amenable to automated processing and
analysis with XML tools. Due to the verbosity of the XML syntax,
this processor will also generate the largest amount of output.
[[5]] FileDistribution is the tool for analyzing file sizes in the
namespace image. In order to run the tool one should define a range
of integers [0, maxSize] by specifying maxSize and a step. The
range of integers is divided into segments of size step: [0, s[1],
..., s[n-1], maxSize], and the processor calculates how many files
in the system fall into each segment [s[i-1], s[i]). Note that
files larger than maxSize always fall into the very last segment.
The output file is formatted as a tab separated two column table:
Size and NumFiles, where Size represents the start of the segment
and NumFiles is the number of files from the image whose size falls
in this segment.
* Usage
** Basic
The simplest usage of the Offline Image Viewer is to provide just an
input and output file, via the -i and -o command-line switches:
----
bash$ bin/hdfs oiv -i fsimage -o fsimage.txt
----
This will create a file named fsimage.txt in the current directory
using the Ls output processor. For very large image files, this process
may take several minutes.
One can specify which output processor via the command-line switch -p.
For instance:
----
bash$ bin/hdfs oiv -i fsimage -o fsimage.xml -p XML
----
or
----
bash$ bin/hdfs oiv -i fsimage -o fsimage.txt -p Indented
----
This will run the tool using either the XML or Indented output
processor, respectively.
One command-line option worth considering is -skipBlocks, which
prevents the tool from explicitly enumerating all of the blocks that
make up a file in the namespace. This is useful for file systems that
have very large files. Enabling this option can significantly decrease
the size of the resulting output, as individual blocks are not
included. Note, however, that the Ls processor needs to enumerate the
blocks and so overrides this option.
Example
Consider the following contrived namespace:
----
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:17 /anotherDir
-rw-r--r-- 3 theuser supergroup 286631664 2009-03-16 21:15 /anotherDir/biggerfile
-rw-r--r-- 3 theuser supergroup 8754 2009-03-16 21:17 /anotherDir/smallFile
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:11 /mapredsystem
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:11 /mapredsystem/theuser
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:11 /mapredsystem/theuser/mapredsystem
drwx-wx-wx - theuser supergroup 0 2009-03-16 21:11 /mapredsystem/theuser/mapredsystem/ip.redacted.com
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:12 /one
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:12 /one/two
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:16 /user
drwxr-xr-x - theuser supergroup 0 2009-03-16 21:19 /user/theuser
----
Applying the Offline Image Processor against this file with default
options would result in the following output:
----
machine:hadoop-0.21.0-dev theuser$ bin/hdfs oiv -i fsimagedemo -o fsimage.txt
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:16 /
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:17 /anotherDir
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:11 /mapredsystem
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:12 /one
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:16 /user
-rw-r--r-- 3 theuser supergroup 286631664 2009-03-16 14:15 /anotherDir/biggerfile
-rw-r--r-- 3 theuser supergroup 8754 2009-03-16 14:17 /anotherDir/smallFile
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:11 /mapredsystem/theuser
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:11 /mapredsystem/theuser/mapredsystem
drwx-wx-wx - theuser supergroup 0 2009-03-16 14:11 /mapredsystem/theuser/mapredsystem/ip.redacted.com
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:12 /one/two
drwxr-xr-x - theuser supergroup 0 2009-03-16 14:19 /user/theuser
----
Similarly, applying the Indented processor would generate output that
begins with:
----
machine:hadoop-0.21.0-dev theuser$ bin/hdfs oiv -i fsimagedemo -p Indented -o fsimage.txt
FSImage
ImageVersion = -19
NamespaceID = 2109123098
GenerationStamp = 1003
INodes [NumInodes = 12]
Inode
INodePath =
Replication = 0
ModificationTime = 2009-03-16 14:16
AccessTime = 1969-12-31 16:00
BlockSize = 0
Blocks [NumBlocks = -1]
NSQuota = 2147483647
DSQuota = -1
Permissions
Username = theuser
GroupName = supergroup
PermString = rwxr-xr-x
...remaining output omitted...
----
* Options
*-----------------------:-----------------------------------+
| <<Flag>> | <<Description>> |
*-----------------------:-----------------------------------+
| <<<-i>>>\|<<<--inputFile>>> <input file> | Specify the input fsimage file to
| | process. Required.
*-----------------------:-----------------------------------+
| <<<-o>>>\|<<<--outputFile>>> <output file> | Specify the output filename, if the
| | specified output processor generates one. If the specified file already
| | exists, it is silently overwritten. Required.
*-----------------------:-----------------------------------+
| <<<-p>>>\|<<<--processor>>> <processor> | Specify the image processor to apply
| | against the image file. Currently valid options are Ls (default), XML,
| | Delimited, Indented and FileDistribution.
*-----------------------:-----------------------------------+
| <<<-skipBlocks>>> | Do not enumerate individual blocks within files. This may
| | save processing time and output file space on namespaces with very
| | large files. The Ls processor reads the blocks to correctly determine
| | file sizes and ignores this option.
*-----------------------:-----------------------------------+
| <<<-printToScreen>>> | Pipe output of processor to console as well as specified
| | file. On extremely large namespaces, this may increase processing time
| | by an order of magnitude.
*-----------------------:-----------------------------------+
| <<<-delimiter>>> <arg>| When used in conjunction with the Delimited processor,
| | replaces the default tab delimiter with the string specified by arg.
*-----------------------:-----------------------------------+
| <<<-h>>>\|<<<--help>>>| Display the tool usage and help information and exit.
*-----------------------:-----------------------------------+
* Analyzing Results
The Offline Image Viewer makes it easy to gather large amounts of data
about the hdfs namespace. This information can then be used to explore
file system usage patterns or find specific files that match arbitrary
criteria, along with other types of namespace analysis. The Delimited
image processor in particular creates output that is amenable to
further processing by tools such as [38]Apache Pig. Pig provides a
particularly good choice for analyzing these data as it is able to deal
with the output generated from a small fsimage but also scales up to
consume data from extremely large file systems.
The Delimited image processor generates lines of text separated, by
default, by tabs and includes all of the fields that are common between
constructed files and files that were still under construction when the
fsimage was generated. Example scripts are provided demonstrating how
to use this output to accomplish three tasks: determine the number of
files each user has created on the file system, find files that were created
but have not been accessed, and find probable duplicates of large files by
comparing the size of each file.
Each of the following scripts assumes you have generated an output file
using the Delimited processor named foo and will be storing the results
of the Pig analysis in a file named results.
** Total Number of Files for Each User
This script processes each path within the namespace, groups them by
the file owner and determines the total number of files each user owns.
----
numFilesOfEachUser.pig:
-- This script determines the total number of files each user has in
-- the namespace. Its output is of the form:
-- username, totalNumFiles
-- Load all of the fields from the file
A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
replication:int,
modTime:chararray,
accessTime:chararray,
blockSize:long,
numBlocks:int,
fileSize:long,
NamespaceQuota:int,
DiskspaceQuota:int,
perms:chararray,
username:chararray,
groupname:chararray);
-- Grab just the path and username
B = FOREACH A GENERATE path, username;
-- Generate the sum of the number of paths for each user
C = FOREACH (GROUP B BY username) GENERATE group, COUNT(B.path);
-- Save results
STORE C INTO '$outputFile';
----
This script can be run against pig with the following command:
----
bin/pig -x local -param inputFile=../foo -param outputFile=../results ../numFilesOfEachUser.pig
----
The output file's content will be similar to that below:
----
bart 1
lisa 16
homer 28
marge 2456
----
** Files That Have Never Been Accessed
This script finds files that were created but whose access times were
never changed, meaning they were never opened or viewed.
----
neverAccessed.pig:
-- This script generates a list of files that were created but never
-- accessed, based on their AccessTime
-- Load all of the fields from the file
A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
replication:int,
modTime:chararray,
accessTime:chararray,
blockSize:long,
numBlocks:int,
fileSize:long,
NamespaceQuota:int,
DiskspaceQuota:int,
perms:chararray,
username:chararray,
groupname:chararray);
-- Grab just the path and last time the file was accessed
B = FOREACH A GENERATE path, accessTime;
-- Drop all the paths that don't have the default assigned last-access time
C = FILTER B BY accessTime == '1969-12-31 16:00';
-- Drop the accessTimes, since they're all the same
D = FOREACH C GENERATE path;
-- Save results
STORE D INTO '$outputFile';
----
This script can be run against pig with the following command and its
output file's content will be a list of files that were created but
never viewed afterwards.
----
bin/pig -x local -param inputFile=../foo -param outputFile=../results ../neverAccessed.pig
----
** Probable Duplicated Files Based on File Size
This script groups files together based on their size, drops any that
are less than 100 MB and returns a list of the file size, number of
files found and a tuple of the file paths. This can be used to find
likely duplicates within the filesystem namespace.
----
probableDuplicates.pig:
-- This script finds probable duplicate files greater than 100 MB by
-- grouping together files based on their byte size. Files of this size
-- with exactly the same number of bytes can be considered probable
-- duplicates, but should be checked further, either by comparing the
-- contents directly or by another proxy, such as a hash of the contents.
-- The scripts output is of the type:
-- fileSize numProbableDuplicates {(probableDup1), (probableDup2)}
-- Load all of the fields from the file
A = LOAD '$inputFile' USING PigStorage('\t') AS (path:chararray,
replication:int,
modTime:chararray,
accessTime:chararray,
blockSize:long,
numBlocks:int,
fileSize:long,
NamespaceQuota:int,
DiskspaceQuota:int,
perms:chararray,
username:chararray,
groupname:chararray);
-- Grab the pathname and filesize
B = FOREACH A generate path, fileSize;
-- Drop files smaller than 100 MB
C = FILTER B by fileSize > 100L * 1024L * 1024L;
-- Gather all the files of the same byte size
D = GROUP C by fileSize;
-- Generate path, num of duplicates, list of duplicates
E = FOREACH D generate group AS fileSize, COUNT(C) as numDupes, C.path AS files;
-- Drop all the files where there are only one of them
F = FILTER E by numDupes > 1L;
-- Sort by the size of the files
G = ORDER F by fileSize;
-- Save results
STORE G INTO '$outputFile';
----
This script can be run against pig with the following command:
----
bin/pig -x local -param inputFile=../foo -param outputFile=../results ../probableDuplicates.pig
----
The output file's content will be similar to that below:
----
1077288632 2 {(/user/tennant/work1/part-00501),(/user/tennant/work1/part-00993)}
1077288664 4 {(/user/tennant/work0/part-00567),(/user/tennant/work0/part-03980),(/user/tennant/work1/part-00725),(/user/eccelston/output/part-03395)}
1077288668 3 {(/user/tennant/work0/part-03705),(/user/tennant/work0/part-04242),(/user/tennant/work1/part-03839)}
1077288698 2 {(/user/tennant/work0/part-00435),(/user/eccelston/output/part-01382)}
1077288702 2 {(/user/tennant/work0/part-03864),(/user/eccelston/output/part-03234)}
----
Each line includes the file size in bytes that was found to be
duplicated, the number of duplicates found, and a list of the
duplicated paths. Files less than 100MB are ignored, providing a
reasonable likelihood that files of these exact sizes may be
duplicates.

View File

@ -0,0 +1,257 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
HDFS Permissions Guide
---
---
${maven.build.timestamp}
HDFS Permissions Guide
\[ {{{./index.html}Go Back}} \]
%{toc|section=1|fromDepth=0}
* Overview
The Hadoop Distributed File System (HDFS) implements a permissions
model for files and directories that shares much of the POSIX model.
Each file and directory is associated with an owner and a group. The
file or directory has separate permissions for the user that is the
owner, for other users that are members of the group, and for all other
users. For files, the r permission is required to read the file, and
the w permission is required to write or append to the file. For
directories, the r permission is required to list the contents of the
directory, the w permission is required to create or delete files or
directories, and the x permission is required to access a child of the
directory.
In contrast to the POSIX model, there are no setuid or setgid bits for
files as there is no notion of executable files. For directories, there
are no setuid or setgid bits as a simplification. The sticky
bit can be set on directories, preventing anyone except the superuser,
directory owner or file owner from deleting or moving the files within
the directory. Setting the sticky bit for a file has no effect.
Collectively, the permissions of a file or directory are its mode. In
general, Unix customs for representing and displaying modes will be
used, including the use of octal numbers in this description. When a
file or directory is created, its owner is the user identity of the
client process, and its group is the group of the parent directory (the
BSD rule).
Each client process that accesses HDFS has a two-part identity composed
of the user name, and groups list. Whenever HDFS must do a permissions
check for a file or directory foo accessed by a client process,
* If the user name matches the owner of foo, then the owner
permissions are tested;
* Else if the group of foo matches any member of the groups list,
then the group permissions are tested;
* Otherwise the other permissions of foo are tested.
If a permissions check fails, the client operation fails.
* User Identity
As of Hadoop 0.22, Hadoop supports two different modes of operation to
determine the user's identity, specified by the
hadoop.security.authentication property:
* <<simple>>
In this mode of operation, the identity of a client process is
determined by the host operating system. On Unix-like systems,
the user name is the equivalent of `whoami`.
* <<kerberos>>
In Kerberized operation, the identity of a client process is
determined by its Kerberos credentials. For example, in a
Kerberized environment, a user may use the kinit utility to
obtain a Kerberos ticket-granting-ticket (TGT) and use klist to
determine their current principal. When mapping a Kerberos
principal to an HDFS username, all components except for the
primary are dropped. For example, a principal
todd/foobar@CORP.COMPANY.COM will act as the simple username
todd on HDFS.
Regardless of the mode of operation, the user identity mechanism is
extrinsic to HDFS itself. There is no provision within HDFS for
creating user identities, establishing groups, or processing user
credentials.
* Group Mapping
Once a username has been determined as described above, the list of
groups is determined by a group mapping service, configured by the
hadoop.security.group.mapping property. The default implementation,
org.apache.hadoop.security.ShellBasedUnixGroupsMapping, will shell out
to the Unix bash -c groups command to resolve a list of groups for a
user.
An alternate implementation, which connects directly to an LDAP server
to resolve the list of groups, is available via
org.apache.hadoop.security.LdapGroupsMapping. However, this provider
should only be used if the required groups reside exclusively in LDAP,
and are not materialized on the Unix servers. More information on
configuring the group mapping service is available in the Javadocs.
For HDFS, the mapping of users to groups is performed on the NameNode.
Thus, the host system configuration of the NameNode determines the
group mappings for the users.
Note that HDFS stores the user and group of a file or directory as
strings; there is no conversion from user and group identity numbers as
is conventional in Unix.
* Understanding the Implementation
Each file or directory operation passes the full path name to the name
node, and the permissions checks are applied along the path for each
operation. The client framework will implicitly associate the user
identity with the connection to the name node, reducing the need for
changes to the existing client API. It has always been the case that
when one operation on a file succeeds, the operation might fail when
repeated because the file, or some directory on the path, no longer
exists. For instance, when the client first begins reading a file, it
makes a first request to the name node to discover the location of the
first blocks of the file. A second request made to find additional
blocks may fail. On the other hand, deleting a file does not revoke
access by a client that already knows the blocks of the file. With the
addition of permissions, a client's access to a file may be withdrawn
between requests. Again, changing permissions does not revoke the
access of a client that already knows the file's blocks.
* Changes to the File System API
All methods that use a path parameter will throw <<<AccessControlException>>>
if permission checking fails.
New methods:
* <<<public FSDataOutputStream create(Path f, FsPermission permission,
boolean overwrite, int bufferSize, short replication, long
blockSize, Progressable progress) throws IOException;>>>
* <<<public boolean mkdirs(Path f, FsPermission permission) throws
IOException;>>>
* <<<public void setPermission(Path p, FsPermission permission) throws
IOException;>>>
* <<<public void setOwner(Path p, String username, String groupname)
throws IOException;>>>
* <<<public FileStatus getFileStatus(Path f) throws IOException;>>>
will additionally return the user, group and mode associated with the
path.
The mode of a new file or directory is restricted by the umask set as a
configuration parameter. When the existing <<<create(path, …)>>> method
(without the permission parameter) is used, the mode of the new file is
<<<0666 & ^umask>>>. When the new <<<create(path, permission, …)>>> method
(with the permission parameter P) is used, the mode of the new file is
<<<P & ^umask & 0666>>>. When a new directory is created with the existing
<<<mkdirs(path)>>>
method (without the permission parameter), the mode of the new
directory is <<<0777 & ^umask>>>. When the new <<<mkdirs(path, permission)>>>
method (with the permission parameter P) is used, the mode of the new
directory is <<<P & ^umask & 0777>>>.
* Changes to the Application Shell
New operations:
* <<<chmod [-R] mode file …>>>
Only the owner of a file or the super-user is permitted to change
the mode of a file.
* <<<chgrp [-R] group file …>>>
The user invoking chgrp must belong to the specified group and be
the owner of the file, or be the super-user.
* <<<chown [-R] [owner][:[group]] file …>>>
The owner of a file may only be altered by a super-user.
* <<<ls file …>>>
* <<<lsr file …>>>
The output is reformatted to display the owner, group and mode.
* The Super-User
The super-user is the user with the same identity as the name node process
itself. Loosely, if you started the name node, then you are the
super-user. The super-user can do anything in that permissions checks
never fail for the super-user. There is no persistent notion of who was
the super-user; when the name node is started the process identity
determines who is the super-user for now. The HDFS super-user does not
have to be the super-user of the name node host, nor is it necessary
that all clusters have the same super-user. Also, an experimenter
running HDFS on a personal workstation conveniently becomes that
installation's super-user without any configuration.
In addition, the administrator may identify a distinguished group using
a configuration parameter. If set, members of this group are also
super-users.
* The Web Server
By default, the identity of the web server is a configuration
parameter. That is, the name node has no notion of the identity of the
real user, but the web server behaves as if it has the identity (user
and groups) of a user chosen by the administrator. Unless the chosen
identity matches the super-user, parts of the name space may be
inaccessible to the web server.
* Configuration Parameters
* <<<dfs.permissions = true>>>
If true, use the permissions system as described here. If false,
permission checking is turned off, but all other behavior is
unchanged. Switching from one parameter value to the other does not
change the mode, owner or group of files or directories.
Regardless of whether permissions are on or off, chmod, chgrp and
chown always check permissions. These functions are only useful in
the permissions context, and so there is no backwards compatibility
issue. Furthermore, this allows administrators to reliably set
owners and permissions in advance of turning on regular permissions
checking.
* <<<dfs.web.ugi = webuser,webgroup>>>
The user name to be used by the web server. Setting this to the
name of the super-user allows any web client to see everything.
Changing this to an otherwise unused identity allows web clients to
see only those things visible using "other" permissions. Additional
groups may be added to the comma-separated list.
* <<<dfs.permissions.superusergroup = supergroup>>>
The name of the group of super-users.
* <<<fs.permissions.umask-mode = 0022>>>
The umask used when creating files and directories. For
configuration files, the decimal value 18 may be used.
* <<<dfs.cluster.administrators = ACL-for-admins>>>
The administrators for the cluster specified as an ACL. This
controls who can access the default servlets, etc. in the HDFS.

View File

@ -0,0 +1,118 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
HDFS Quotas Guide
---
---
${maven.build.timestamp}
HDFS Quotas Guide
\[ {{{./index.html}Go Back}} \]
%{toc|section=1|fromDepth=0}
* Overview
The Hadoop Distributed File System (HDFS) allows the administrator to
set quotas for the number of names used and the amount of space used
for individual directories. Name quotas and space quotas operate
independently, but the administration and implementation of the two
types of quotas are closely parallel.
* Name Quotas
The name quota is a hard limit on the number of file and directory
names in the tree rooted at that directory. File and directory
creations fail if the quota would be exceeded. Quotas stick with
renamed directories; the rename operation fails if the operation would
result in a quota violation. The attempt to set a quota will still
succeed even if the directory would be in violation of the new quota. A
newly created directory has no associated quota. The largest quota is
<<<Long.MAX_VALUE>>>. A quota of one forces a directory to remain empty.
(Yes, a directory counts against its own quota!)
Quotas are persistent with the fsimage. When starting, if the fsimage
is immediately in violation of a quota (perhaps the fsimage was
surreptitiously modified), a warning is printed for each of such
violations. Setting or removing a quota creates a journal entry.
* Space Quotas
The space quota is a hard limit on the number of bytes used by files in
the tree rooted at that directory. Block allocations fail if the quota
would not allow a full block to be written. Each replica of a block
counts against the quota. Quotas stick with renamed directories; the
rename operation fails if the operation would result in a quota
violation. A newly created directory has no associated quota. The
largest quota is <<<Long.MAX_VALUE>>>. A quota of zero still permits files
to be created, but no blocks can be added to the files. Directories don't
use host file system space and don't count against the space quota. The
host file system space used to save the file meta data is not counted
against the quota. Quotas are charged at the intended replication
factor for the file; changing the replication factor for a file will
credit or debit quotas.
Quotas are persistent with the fsimage. When starting, if the fsimage
is immediately in violation of a quota (perhaps the fsimage was
surreptitiously modified), a warning is printed for each of such
violations. Setting or removing a quota creates a journal entry.
* Administrative Commands
Quotas are managed by a set of commands available only to the
administrator.
* <<<dfsadmin -setQuota <N> <directory>...<directory> >>>
Set the name quota to be N for each directory. Best effort for each
directory, with faults reported if N is not a positive long
integer, the directory does not exist or it is a file, or the
directory would immediately exceed the new quota.
* <<<dfsadmin -clrQuota <directory>...<directory> >>>
Remove any name quota for each directory. Best effort for each
directory, with faults reported if the directory does not exist or
it is a file. It is not a fault if the directory has no quota.
* <<<dfsadmin -setSpaceQuota <N> <directory>...<directory> >>>
Set the space quota to be N bytes for each directory. This is a
hard limit on total size of all the files under the directory tree.
The space quota takes replication also into account, i.e. one GB of
data with replication of 3 consumes 3GB of quota. N can also be
specified with a binary prefix for convenience, e.g. 50g for 50
gigabytes and 2t for 2 terabytes etc. Best effort for each
directory, with faults reported if N is neither zero nor a positive
integer, the directory does not exist or it is a file, or the
directory would immediately exceed the new quota.
* <<<dfsadmin -clrSpaceQuota <directory>...<directory> >>>
Remove any space quota for each directory. Best effort for each
directory, with faults reported if the directory does not exist or
it is a file. It is not a fault if the directory has no quota.
* Reporting Command
An extension to the count command of the HDFS shell reports quota
values and the current count of names and bytes in use.
* <<<fs -count -q <directory>...<directory> >>>
With the -q option, also report the name quota value set for each
directory, the available name quota remaining, the space quota
value set, and the available space quota remaining. If the
directory does not have a quota set, the reported values are <<<none>>>
and <<<inf>>>.

View File

@ -0,0 +1,499 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
HDFS Users Guide
---
---
${maven.build.timestamp}
HDFS Users Guide
%{toc|section=1|fromDepth=0}
* Purpose
This document is a starting point for users working with Hadoop
Distributed File System (HDFS) either as a part of a Hadoop cluster or
as a stand-alone general purpose distributed file system. While HDFS is
designed to "just work" in many environments, a working knowledge of
HDFS helps greatly with configuration improvements and diagnostics on a
specific cluster.
* Overview
HDFS is the primary distributed storage used by Hadoop applications. An
HDFS cluster primarily consists of a NameNode that manages the file
system metadata and DataNodes that store the actual data. The HDFS
Architecture Guide describes HDFS in detail. This user guide primarily
deals with the interaction of users and administrators with HDFS
clusters. The HDFS architecture diagram depicts basic interactions
among NameNode, the DataNodes, and the clients. Clients contact
NameNode for file metadata or file modifications and perform actual
file I/O directly with the DataNodes.
The following are some of the salient features that could be of
interest to many users.
* Hadoop, including HDFS, is well suited for distributed storage and
distributed processing using commodity hardware. It is fault
tolerant, scalable, and extremely simple to expand. MapReduce, well
known for its simplicity and applicability for a large set of
distributed applications, is an integral part of Hadoop.
* HDFS is highly configurable with a default configuration well
suited for many installations. Most of the time, configuration
needs to be tuned only for very large clusters.
* Hadoop is written in Java and is supported on all major platforms.
* Hadoop supports shell-like commands to interact with HDFS directly.
* The NameNode and DataNodes have built-in web servers that make it
easy to check current status of the cluster.
* New features and improvements are regularly implemented in HDFS.
The following is a subset of useful features in HDFS:
* File permissions and authentication.
* Rack awareness: to take a node's physical location into
account while scheduling tasks and allocating storage.
* Safemode: an administrative mode for maintenance.
* <<<fsck>>>: a utility to diagnose health of the file system, to find
missing files or blocks.
* <<<fetchdt>>>: a utility to fetch DelegationToken and store it in a
file on the local system.
* Rebalancer: tool to balance the cluster when the data is
unevenly distributed among DataNodes.
* Upgrade and rollback: after a software upgrade, it is possible
to roll back to HDFS' state before the upgrade in case of
unexpected problems.
* Secondary NameNode: performs periodic checkpoints of the
namespace and helps keep the size of the file containing the log of
HDFS modifications within certain limits at the NameNode.
* Checkpoint node: performs periodic checkpoints of the
namespace and helps minimize the size of the log stored at the
NameNode containing changes to the HDFS. Replaces the role
previously filled by the Secondary NameNode, though is not yet
battle hardened. The NameNode allows multiple Checkpoint nodes
simultaneously, as long as there are no Backup nodes
registered with the system.
* Backup node: An extension to the Checkpoint node. In addition
to checkpointing it also receives a stream of edits from the
NameNode and maintains its own in-memory copy of the
namespace, which is always in sync with the active NameNode
namespace state. Only one Backup node may be registered with
the NameNode at once.
* Prerequisites
The following documents describe how to install and set up a Hadoop
cluster:
* {{Single Node Setup}} for first-time users.
* {{Cluster Setup}} for large, distributed clusters.
The rest of this document assumes the user is able to set up and run a
HDFS with at least one DataNode. For the purpose of this document, both
the NameNode and DataNode could be running on the same physical
machine.
* Web Interface
NameNode and DataNode each run an internal web server in order to
display basic information about the current status of the cluster. With
the default configuration, the NameNode front page is at
<<<http://namenode-name:50070/>>>. It lists the DataNodes in the cluster and
basic statistics of the cluster. The web interface can also be used to
browse the file system (using "Browse the file system" link on the
NameNode front page).
* Shell Commands
Hadoop includes various shell-like commands that directly interact with
HDFS and other file systems that Hadoop supports. The command <<<bin/hdfs dfs -help>>>
lists the commands supported by Hadoop shell. Furthermore,
the command <<<bin/hdfs dfs -help command-name>>> displays more detailed help
for a command. These commands support most of the normal file system
operations like copying files, changing file permissions, etc. They also
support a few HDFS specific operations like changing replication of
files. For more information see {{{File System Shell Guide}}}.
** DFSAdmin Command
The <<<bin/hadoop dfsadmin>>> command supports a few HDFS administration
related operations. The <<<bin/hadoop dfsadmin -help>>> command lists all the
commands currently supported. For example:
* <<<-report>>>: reports basic statistics of HDFS. Some of this
information is also available on the NameNode front page.
* <<<-safemode>>>: though usually not required, an administrator can
manually enter or leave Safemode.
* <<<-finalizeUpgrade>>>: removes previous backup of the cluster made
during last upgrade.
* <<<-refreshNodes>>>: Updates the namenode with the set of datanodes
allowed to connect to the namenode. Namenodes re-read datanode
hostnames in the file defined by <<<dfs.hosts>>>, <<<dfs.hosts.exclude>>>.
Hosts defined in <<<dfs.hosts>>> are the datanodes that are part of the
cluster. If there are entries in <<<dfs.hosts>>>, only the hosts in it
are allowed to register with the namenode. Entries in
<<<dfs.hosts.exclude>>> are datanodes that need to be decommissioned.
Datanodes complete decommissioning when all the replicas from them
are replicated to other datanodes. Decommissioned nodes are not
automatically shut down and are not chosen for writing new
replicas.
* <<<-printTopology>>> : Print the topology of the cluster. Display a tree
of racks and datanodes attached to the racks as viewed by the
NameNode.
For command usage, see {{{dfsadmin}}}.
* Secondary NameNode
The NameNode stores modifications to the file system as a log appended
to a native file system file, edits. When a NameNode starts up, it
reads HDFS state from an image file, fsimage, and then applies edits
from the edits log file. It then writes new HDFS state to the fsimage
and starts normal operation with an empty edits file. Since NameNode
merges fsimage and edits files only during start up, the edits log file
could get very large over time on a busy cluster. Another side effect
of a larger edits file is that next restart of NameNode takes longer.
The secondary NameNode merges the fsimage and the edits log files
periodically and keeps edits log size within a limit. It is usually run
on a different machine than the primary NameNode since its memory
requirements are on the same order as the primary NameNode.
The start of the checkpoint process on the secondary NameNode is
controlled by two configuration parameters.
* <<<dfs.namenode.checkpoint.period>>>, set to 1 hour by default, specifies
the maximum delay between two consecutive checkpoints, and
* <<<dfs.namenode.checkpoint.txns>>>, set to 40000 by default, defines the
number of uncheckpointed transactions on the NameNode which will
force an urgent checkpoint, even if the checkpoint period has not
been reached.
The secondary NameNode stores the latest checkpoint in a directory
which is structured the same way as the primary NameNode's directory,
so that the checkpointed image is always ready to be read by the
primary NameNode if necessary.
For command usage, see {{{secondarynamenode}}}.
* Checkpoint Node
NameNode persists its namespace using two files: fsimage, which is the
latest checkpoint of the namespace and edits, a journal (log) of
changes to the namespace since the checkpoint. When a NameNode starts
up, it merges the fsimage and edits journal to provide an up-to-date
view of the file system metadata. The NameNode then overwrites fsimage
with the new HDFS state and begins a new edits journal.
The Checkpoint node periodically creates checkpoints of the namespace.
It downloads fsimage and edits from the active NameNode, merges them
locally, and uploads the new image back to the active NameNode. The
Checkpoint node usually runs on a different machine than the NameNode
since its memory requirements are on the same order as the NameNode.
The Checkpoint node is started by bin/hdfs namenode -checkpoint on the
node specified in the configuration file.
The location of the Checkpoint (or Backup) node and its accompanying
web interface are configured via the <<<dfs.namenode.backup.address>>> and
<<<dfs.namenode.backup.http-address>>> configuration variables.
The start of the checkpoint process on the Checkpoint node is
controlled by two configuration parameters.
* <<<dfs.namenode.checkpoint.period>>>, set to 1 hour by default, specifies
the maximum delay between two consecutive checkpoints, and
* <<<dfs.namenode.checkpoint.txns>>>, set to 40000 by default, defines the
number of uncheckpointed transactions on the NameNode which will
force an urgent checkpoint, even if the checkpoint period has not
been reached.
The Checkpoint node stores the latest checkpoint in a directory that is
structured the same as the NameNode's directory. This allows the
checkpointed image to be always available for reading by the NameNode
if necessary. See Import checkpoint.
Multiple checkpoint nodes may be specified in the cluster configuration
file.
For command usage, see {{{namenode}}}.
* Backup Node
The Backup node provides the same checkpointing functionality as the
Checkpoint node, as well as maintaining an in-memory, up-to-date copy
of the file system namespace that is always synchronized with the
active NameNode state. Along with accepting a journal stream of file
system edits from the NameNode and persisting this to disk, the Backup
node also applies those edits into its own copy of the namespace in
memory, thus creating a backup of the namespace.
The Backup node does not need to download fsimage and edits files from
the active NameNode in order to create a checkpoint, as would be
required with a Checkpoint node or Secondary NameNode, since it already
has an up-to-date state of the namespace in memory. The Backup
node checkpoint process is more efficient as it only needs to save the
namespace into the local fsimage file and reset edits.
As the Backup node maintains a copy of the namespace in memory, its RAM
requirements are the same as the NameNode.
The NameNode supports one Backup node at a time. No Checkpoint nodes
may be registered if a Backup node is in use. Using multiple Backup
nodes concurrently will be supported in the future.
The Backup node is configured in the same manner as the Checkpoint
node. It is started with <<<bin/hdfs namenode -backup>>>.
The location of the Backup (or Checkpoint) node and its accompanying
web interface are configured via the <<<dfs.namenode.backup.address>>> and
<<<dfs.namenode.backup.http-address>>> configuration variables.
Use of a Backup node provides the option of running the NameNode with
no persistent storage, delegating all responsibility for persisting the
state of the namespace to the Backup node. To do this, start the
NameNode with the <<<-importCheckpoint>>> option, along with specifying no
persistent storage directories of type edits <<<dfs.namenode.edits.dir>>> for
the NameNode configuration.
For a complete discussion of the motivation behind the creation of the
Backup node and Checkpoint node, see {{{https://issues.apache.org/jira/browse/HADOOP-4539}HADOOP-4539}}.
For command usage, see {{{namenode}}}.
* Import Checkpoint
The latest checkpoint can be imported to the NameNode if all other
copies of the image and the edits files are lost. In order to do that
one should:
* Create an empty directory specified in the <<<dfs.namenode.name.dir>>>
configuration variable;
* Specify the location of the checkpoint directory in the
configuration variable <<<dfs.namenode.checkpoint.dir>>>;
* and start the NameNode with <<<-importCheckpoint>>> option.
The NameNode will upload the checkpoint from the
<<<dfs.namenode.checkpoint.dir>>> directory and then save it to the NameNode
directory(s) set in <<<dfs.namenode.name.dir>>>. The NameNode will fail if a
legal image is contained in <<<dfs.namenode.name.dir>>>. The NameNode
verifies that the image in <<<dfs.namenode.checkpoint.dir>>> is consistent,
but does not modify it in any way.
For command usage, see {{{namenode}}}.
* Rebalancer
HDFS data might not always be placed uniformly across the DataNodes.
One common reason is addition of new DataNodes to an existing cluster.
While placing new blocks (data for a file is stored as a series of
blocks), NameNode considers various parameters before choosing the
DataNodes to receive these blocks. Some of the considerations are:
* Policy to keep one of the replicas of a block on the same node as
the node that is writing the block.
* Need to spread different replicas of a block across the racks so
that cluster can survive loss of whole rack.
* One of the replicas is usually placed on the same rack as the node
writing to the file so that cross-rack network I/O is reduced.
* Spread HDFS data uniformly across the DataNodes in the cluster.
Due to multiple competing considerations, data might not be uniformly
placed across the DataNodes. HDFS provides a tool for administrators
that analyzes block placement and rebalances data across the DataNodes.
A brief administrator's guide for rebalancer as a PDF is attached to
{{{https://issues.apache.org/jira/browse/HADOOP-1652}HADOOP-1652}}.
For command usage, see {{{balancer}}}.
* Rack Awareness
Typically large Hadoop clusters are arranged in racks and network
traffic between different nodes within the same rack is much more
desirable than network traffic across the racks. In addition NameNode
tries to place replicas of block on multiple racks for improved fault
tolerance. Hadoop lets the cluster administrators decide which rack a
node belongs to through configuration variable
<<<net.topology.script.file.name>>>. When this script is configured, each
node runs the script to determine its rack id. A default installation
assumes all the nodes belong to the same rack. This feature and
configuration is further described in PDF attached to
{{{https://issues.apache.org/jira/browse/HADOOP-692}HADOOP-692}}.
* Safemode
During start up the NameNode loads the file system state from the
fsimage and the edits log file. It then waits for DataNodes to report
their blocks so that it does not prematurely start replicating the
blocks though enough replicas already exist in the cluster. During this
time NameNode stays in Safemode. Safemode for the NameNode is
essentially a read-only mode for the HDFS cluster, where it does not
allow any modifications to file system or blocks. Normally the NameNode
leaves Safemode automatically after the DataNodes have reported that
most file system blocks are available. If required, HDFS could be
placed in Safemode explicitly using <<<bin/hadoop dfsadmin -safemode>>>
command. NameNode front page shows whether Safemode is on or off. A
more detailed description and configuration is maintained as JavaDoc
for <<<setSafeMode()>>>.
* fsck
HDFS supports the fsck command to check for various inconsistencies. It
is designed for reporting problems with various files, for example,
missing blocks for a file or under-replicated blocks. Unlike a
traditional fsck utility for native file systems, this command does not
correct the errors it detects. Normally NameNode automatically corrects
most of the recoverable failures. By default fsck ignores open files
but provides an option to select all files during reporting. The HDFS
fsck command is not a Hadoop shell command. It can be run as
<<<bin/hadoop fsck>>>. For command usage, see {{{fsck}}}. fsck can be run on the
whole file system or on a subset of files.
* fetchdt
HDFS supports the fetchdt command to fetch Delegation Token and store
it in a file on the local system. This token can be later used to
access secure server (NameNode for example) from a non secure client.
Utility uses either RPC or HTTPS (over Kerberos) to get the token, and
thus requires kerberos tickets to be present before the run (run kinit
to get the tickets). The HDFS fetchdt command is not a Hadoop shell
command. It can be run as <<<bin/hadoop fetchdt DTfile>>>. After you get
the token you can run an HDFS command without having Kerberos tickets,
by pointing the <<<HADOOP_TOKEN_FILE_LOCATION>>> environment variable to the
delegation token file. For command usage, see {{{fetchdt}}} command.
* Recovery Mode
Typically, you will configure multiple metadata storage locations.
Then, if one storage location is corrupt, you can read the metadata
from one of the other storage locations.
However, what can you do if the only storage locations available are
corrupt? In this case, there is a special NameNode startup mode called
Recovery mode that may allow you to recover most of your data.
You can start the NameNode in recovery mode like so: <<<namenode -recover>>>
When in recovery mode, the NameNode will interactively prompt you at
the command line about possible courses of action you can take to
recover your data.
If you don't want to be prompted, you can give the <<<-force>>> option. This
option will force recovery mode to always select the first choice.
Normally, this will be the most reasonable choice.
Because Recovery mode can cause you to lose data, you should always
back up your edit log and fsimage before using it.
* Upgrade and Rollback
When Hadoop is upgraded on an existing cluster, as with any software
upgrade, it is possible there are new bugs or incompatible changes that
affect existing applications and were not discovered earlier. In any
non-trivial HDFS installation, it is not an option to lose any data,
let alone to restart HDFS from scratch. HDFS allows administrators to
go back to earlier version of Hadoop and rollback the cluster to the
state it was in before the upgrade. HDFS upgrade is described in more
detail in {{{Hadoop Upgrade}}} Wiki page. HDFS can have one such backup at a
time. Before upgrading, administrators need to remove existing backup
using the <<<bin/hadoop dfsadmin -finalizeUpgrade>>> command. The following
briefly describes the typical upgrade procedure:
* Before upgrading Hadoop software, finalize if there is an existing
backup. <<<dfsadmin -upgradeProgress>>> status can tell if the cluster
needs to be finalized.
* Stop the cluster and distribute new version of Hadoop.
* Run the new version with <<<-upgrade>>> option (<<<bin/start-dfs.sh -upgrade>>>).
* Most of the time, cluster works just fine. Once the new HDFS is
considered working well (maybe after a few days of operation),
finalize the upgrade. Note that until the cluster is finalized,
deleting the files that existed before the upgrade does not free up
real disk space on the DataNodes.
* If there is a need to move back to the old version,
* stop the cluster and distribute earlier version of Hadoop.
* start the cluster with rollback option. (<<<bin/start-dfs.sh -rollback>>>).
* File Permissions and Security
The file permissions are designed to be similar to file permissions on
other familiar platforms like Linux. Currently, security is limited to
simple file permissions. The user that starts NameNode is treated as
the superuser for HDFS. Future versions of HDFS will support network
authentication protocols like Kerberos for user authentication and
encryption of data transfers. The details are discussed in the
Permissions Guide.
* Scalability
Hadoop currently runs on clusters with thousands of nodes. The
{{{PoweredBy}}} Wiki page lists some of the organizations that deploy Hadoop
on large clusters. HDFS has one NameNode for each cluster. Currently
the total memory available on NameNode is the primary scalability
limitation. On very large clusters, increasing average size of files
stored in HDFS helps with increasing cluster size without increasing
memory requirements on NameNode. The default configuration may not
suit very large clusters. The {{{FAQ}}} Wiki page lists suggested
configuration improvements for large Hadoop clusters.
* Related Documentation
This user guide is a good starting point for working with HDFS. While
the user guide continues to improve, there is a large wealth of
documentation about Hadoop and HDFS. The following list is a starting
point for further exploration:
* {{{Hadoop Site}}}: The home page for the Apache Hadoop site.
* {{{Hadoop Wiki}}}: The home page (FrontPage) for the Hadoop Wiki. Unlike
the released documentation, which is part of Hadoop source tree,
Hadoop Wiki is regularly edited by Hadoop Community.
* {{{FAQ}}}: The FAQ Wiki page.
* {{{Hadoop JavaDoc API}}}.
* {{{Hadoop User Mailing List}}}: core-user[at]hadoop.apache.org.
* Explore {{{src/hdfs/hdfs-default.xml}}}. It includes brief description of
most of the configuration variables available.
* {{{Hadoop Commands Guide}}}: Hadoop commands usage.

View File

@ -0,0 +1,60 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
HFTP Guide
---
---
${maven.build.timestamp}
HFTP Guide
\[ {{{./index.html}Go Back}} \]
%{toc|section=1|fromDepth=0}
* Introduction
HFTP is a Hadoop filesystem implementation that lets you read data from
a remote Hadoop HDFS cluster. The reads are done via HTTP, and data is
sourced from DataNodes. HFTP is a read-only filesystem, and will throw
exceptions if you try to use it to write data or modify the filesystem
state.
HFTP is primarily useful if you have multiple HDFS clusters with
different versions and you need to move data from one to another. HFTP
is wire-compatible even between different versions of HDFS. For
example, you can do things like: <<<hadoop distcp -i hftp://sourceFS:50070/src hdfs://destFS:50070/dest>>>.
Note that HFTP is read-only so the destination must be an HDFS filesystem.
(Also, in this example, the distcp should be run using the configuration of
the new filesystem.)
An extension, HSFTP, uses HTTPS by default. This means that data will
be encrypted in transit.
* Implementation
The code for HFTP lives in the Java class
<<<org.apache.hadoop.hdfs.HftpFileSystem>>>. Likewise, HSFTP is implemented
in <<<org.apache.hadoop.hdfs.HsftpFileSystem>>>.
* Configuration Options
*-----------------------:-----------------------------------+
| <<Name>> | <<Description>> |
*-----------------------:-----------------------------------+
| <<<dfs.hftp.https.port>>> | the HTTPS port on the remote cluster. If not set,
| | HFTP will fall back on <<<dfs.https.port>>>.
*-----------------------:-----------------------------------+
| <<<hdfs.service.host_ip:port>>> | Specifies the service name (for the security
| | subsystem) associated with the HFTP filesystem running at ip:port.
*-----------------------:-----------------------------------+

View File

@ -0,0 +1,94 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
C API libhdfs
---
---
${maven.build.timestamp}
C API libhdfs
%{toc|section=1|fromDepth=0}
* Overview
libhdfs is a JNI based C API for Hadoop's Distributed File System
(HDFS). It provides C APIs to a subset of the HDFS APIs to manipulate
HDFS files and the filesystem. libhdfs is part of the Hadoop
distribution and comes pre-compiled in
<<<${HADOOP_PREFIX}/libhdfs/libhdfs.so>>> .
* The APIs
The libhdfs APIs are a subset of: {{{hadoop fs APIs}}}.
The header file for libhdfs describes each API in detail and is
available in <<<${HADOOP_PREFIX}/src/c++/libhdfs/hdfs.h>>>
* A Sample Program
----
\#include "hdfs.h"
int main(int argc, char **argv) {
hdfsFS fs = hdfsConnect("default", 0);
const char* writePath = "/tmp/testfile.txt";
hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0);
if(!writeFile) {
fprintf(stderr, "Failed to open %s for writing!\n", writePath);
exit(-1);
}
char* buffer = "Hello, World!";
tSize num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer)+1);
if (hdfsFlush(fs, writeFile)) {
fprintf(stderr, "Failed to 'flush' %s\n", writePath);
exit(-1);
}
hdfsCloseFile(fs, writeFile);
}
----
* How To Link With The Library
See the Makefile for <<<hdfs_test.c>>> in the libhdfs source directory
(<<<${HADOOP_PREFIX}/src/c++/libhdfs/Makefile>>>) or something like:
<<<gcc above_sample.c -I${HADOOP_PREFIX}/src/c++/libhdfs -L${HADOOP_PREFIX}/libhdfs -lhdfs -o above_sample>>>
* Common Problems
The most common problem is that the <<<CLASSPATH>>> is not set properly when
calling a program that uses libhdfs. Make sure you set it to all the
Hadoop jars needed to run Hadoop itself. Currently, there is no way to
programmatically generate the classpath, but a good bet is to include
all the jar files in <<<${HADOOP_PREFIX}>>> and <<<${HADOOP_PREFIX}/lib>>> as well
as the right configuration directory containing <<<hdfs-site.xml>>>
* Thread Safe
libhdfs is thread safe.
* Concurrency and Hadoop FS "handles"
The Hadoop FS implementation includes a FS handle cache which
caches based on the URI of the namenode along with the user
connecting. So, all calls to <<<hdfsConnect>>> will return the same
handle but calls to <<<hdfsConnectAsUser>>> with different users will
return different handles. But, since HDFS client handles are
completely thread safe, this has no bearing on concurrency.
* Concurrency and libhdfs/JNI
The libhdfs calls to JNI should always be creating thread local
storage, so (in theory), libhdfs should be as thread safe as the
underlying calls to the Hadoop FS.

View File

@ -0,0 +1,195 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Synthetic Load Generator Guide
---
---
${maven.build.timestamp}
Synthetic Load Generator Guide
%{toc|section=1|fromDepth=0}
* Overview
The synthetic load generator (SLG) is a tool for testing NameNode
behavior under different client loads. The user can generate different
mixes of read, write, and list requests by specifying the probabilities
of read and write. The user controls the intensity of the load by
adjusting parameters for the number of worker threads and the delay
between operations. While load generators are running, the user can
profile and monitor the running of the NameNode. When a load generator
exits, it prints some NameNode statistics like the average execution
time of each kind of operation and the NameNode throughput.
* Synopsis
The synopsis of the command is:
----
java LoadGenerator [options]
----
Options include:
* <<<-readProbability>>> <read probability>
The probability of the read operation; default is 0.3333.
* <<<-writeProbability>>> <write probability>
The probability of the write operations; default is 0.3333.
* <<<-root>>> <test space root>
The root of the test space; default is /testLoadSpace.
* <<<-maxDelayBetweenOps>>> <maxDelayBetweenOpsInMillis>
The maximum delay between two consecutive operations in a thread;
default is 0 indicating no delay.
* <<<-numOfThreads>>> <numOfThreads>
The number of threads to spawn; default is 200.
* <<<-elapsedTime>>> <elapsedTimeInSecs>
The number of seconds that the program will run; A value of zero
indicates that the program runs forever. The default value is 0.
* <<<-startTime>>> <startTimeInMillis>
The time that all worker threads start to run. By default it is 10
seconds after the main program starts running. This creates a
barrier if more than one load generator is running.
* <<<-seed>>> <seed>
The random generator seed for repeating requests to NameNode when
running with a single thread; default is the current time.
After command line argument parsing, the load generator traverses the
test space and builds a table of all directories and another table of
all files in the test space. It then waits until the start time to
spawn the number of worker threads as specified by the user. Each
thread sends a stream of requests to NameNode. At each iteration, it
first decides if it is going to read a file, create a file, or list a
directory following the read and write probabilities specified by the
user. The listing probability is equal to 1-read probability-write
probability. When reading, it randomly picks a file in the test space
and reads the entire file. When writing, it randomly picks a directory
in the test space and creates a file there.
To avoid two threads with the same load generator or from two different
load generators creating the same file, the file name consists of the
current machine's host name and the thread id. The length of the file
follows Gaussian distribution with an average size of 2 blocks and the
standard deviation of 1. The new file is filled with byte 'a'. To avoid
the test space growing indefinitely, the file is deleted immediately
after the file creation completes. While listing, it randomly picks a
directory in the test space and lists its content.
After an operation completes, the thread pauses for a random amount of
time in the range of [0, maxDelayBetweenOps] if the specified maximum
delay is not zero. All threads are stopped when the specified elapsed
time is passed. Before exiting, the program prints the average
execution time for each kind of NameNode operation, and the number of
requests served by the NameNode per second.
* Test Space Population
The user needs to populate a test space before running a load
generator. The structure generator generates a random test space
structure and the data generator creates the files and directories of
the test space in Hadoop distributed file system.
** Structure Generator
This tool generates a random namespace structure with the following
constraints:
[[1]] The number of subdirectories that a directory can have is a random
number in [minWidth, maxWidth].
[[2]] The maximum depth of each subdirectory is a random number in
[2*maxDepth/3, maxDepth].
[[3]] Files are randomly placed in leaf directories. The size of each
file follows Gaussian distribution with an average size of 1 block
and a standard deviation of 1.
The generated namespace structure is described by two files in the
output directory. Each line of the first file contains the full name of
a leaf directory. Each line of the second file contains the full name
of a file and its size, separated by a blank.
The synopsis of the command is:
----
java StructureGenerator [options]
----
Options include:
* <<<-maxDepth>>> <maxDepth>
Maximum depth of the directory tree; default is 5.
* <<<-minWidth>>> <minWidth>
Minimum number of subdirectories per directory; default is 1.
* <<<-maxWidth>>> <maxWidth>
Maximum number of subdirectories per directory; default is 5.
* <<<-numOfFiles>>> <#OfFiles>
The total number of files in the test space; default is 10.
* <<<-avgFileSize>>> <avgFileSizeInBlocks>
Average file size, in blocks; default is 1.
* <<<-outDir>>> <outDir>
Output directory; default is the current directory.
* <<<-seed>>> <seed>
Random number generator seed; default is the current time.
** Data Generator
This tool reads the directory structure and file structure from the
input directory and creates the namespace in Hadoop distributed file
system. All files are filled with byte 'a'.
The synopsis of the command is:
----
java DataGenerator [options]
----
Options include:
* <<<-inDir>>> <inDir>
Input directory name where directory/file structures are stored;
default is the current directory.
* <<<-root>>> <test space root>
The name of the root directory which the new namespace is going to
be placed under; default is "/testLoadSpace".

View File

@ -46,8 +46,10 @@ import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.balancer.Balancer.Cli;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Tool;
import org.junit.Test;
/**
@ -95,7 +97,6 @@ public class TestBalancer {
DFSTestUtil.waitReplication(fs, filePath, replicationFactor);
}
/* fill up a cluster with <code>numNodes</code> datanodes
* whose used space to be <code>size</code>
*/
@ -301,10 +302,12 @@ public class TestBalancer {
* @param racks - array of racks for original nodes in cluster
* @param newCapacity - new node's capacity
* @param newRack - new node's rack
* @param useTool - if true run test via Cli with command-line argument
* parsing, etc. Otherwise invoke balancer API directly.
* @throws Exception
*/
private void doTest(Configuration conf, long[] capacities, String[] racks,
long newCapacity, String newRack) throws Exception {
long newCapacity, String newRack, boolean useTool) throws Exception {
assertEquals(capacities.length, racks.length);
int numOfDatanodes = capacities.length;
cluster = new MiniDFSCluster.Builder(conf)
@ -330,7 +333,11 @@ public class TestBalancer {
totalCapacity += newCapacity;
// run balancer and validate results
runBalancer(conf, totalUsedSpace, totalCapacity);
if (useTool) {
runBalancerCli(conf, totalUsedSpace, totalCapacity);
} else {
runBalancer(conf, totalUsedSpace, totalCapacity);
}
} finally {
cluster.shutdown();
}
@ -350,22 +357,38 @@ public class TestBalancer {
waitForBalancer(totalUsedSpace, totalCapacity, client, cluster);
}
/**
 * Runs the balancer through its command-line {@link Tool} entry point
 * ({@code Cli}) instead of invoking the balancer API directly, and asserts
 * that the tool reports exit code 0.
 *
 * @param conf configuration handed to the tool via {@code setConf}
 * @param totalUsedSpace forwarded to waitForHeartBeat / waitForBalancer
 * @param totalCapacity forwarded to waitForHeartBeat / waitForBalancer
 * @throws Exception propagated from the tool run or from the wait helpers
 */
private void runBalancerCli(Configuration conf,
long totalUsedSpace, long totalCapacity) throws Exception {
// Wait before starting the tool, using the same totals that are checked afterwards.
waitForHeartBeat(totalUsedSpace, totalCapacity, client, cluster);
// Arguments are given as a raw argv array so that Cli's own command-line
// parsing is exercised.
final String[] args = { "-policy", "datanode" };
final Tool tool = new Cli();
tool.setConf(conf);
final int r = tool.run(args); // start rebalancing
assertEquals("Tools should exit 0 on success", 0, r);
waitForHeartBeat(totalUsedSpace, totalCapacity, client, cluster);
// NOTE(review): the message says "default ctor" although this path goes
// through Cli -- presumably copied from runBalancer; confirm and reword.
LOG.info("Rebalancing with default ctor.");
waitForBalancer(totalUsedSpace, totalCapacity, client, cluster);
}
/** one-node cluster test*/
private void oneNodeTest(Configuration conf) throws Exception {
private void oneNodeTest(Configuration conf, boolean useTool) throws Exception {
// add an empty node with half of the CAPACITY & the same rack
doTest(conf, new long[]{CAPACITY}, new String[]{RACK0}, CAPACITY/2, RACK0);
doTest(conf, new long[]{CAPACITY}, new String[]{RACK0}, CAPACITY/2,
RACK0, useTool);
}
/** two-node cluster test */
private void twoNodeTest(Configuration conf) throws Exception {
doTest(conf, new long[]{CAPACITY, CAPACITY}, new String[]{RACK0, RACK1},
CAPACITY, RACK2);
CAPACITY, RACK2, false);
}
/** test using a user-supplied conf */
public void integrationTest(Configuration conf) throws Exception {
initConf(conf);
oneNodeTest(conf);
oneNodeTest(conf, false);
}
/**
@ -401,7 +424,7 @@ public class TestBalancer {
void testBalancer0Internal(Configuration conf) throws Exception {
initConf(conf);
oneNodeTest(conf);
oneNodeTest(conf, false);
twoNodeTest(conf);
}
@ -495,7 +518,18 @@ public class TestBalancer {
}
/**
 * Verify that the balancer exits with status 0 on success when the
 * one-node test is run with {@code useTool} set to true.
 */
@Test(timeout=100000)
public void testExitZeroOnSuccess() throws Exception {
  final Configuration toolConf = new HdfsConfiguration();
  initConf(toolConf);
  // true => exercise the tool path rather than the direct balancer call
  oneNodeTest(toolConf, true);
}
/**
* @param args
*/

View File

@ -27,11 +27,13 @@ import java.security.PrivilegedExceptionAction;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSMainOperationsBaseTest;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileSystemTestHelper;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.datanode.web.resources.DatanodeWebHdfsMethods;
import org.apache.hadoop.hdfs.web.resources.ExceptionHandler;
@ -60,6 +62,7 @@ public class TestFSMainOperationsWebHdfs extends FSMainOperationsBaseTest {
final Configuration conf = new Configuration();
conf.setBoolean(DFSConfigKeys.DFS_WEBHDFS_ENABLED_KEY, true);
conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
cluster.waitActive();
@ -101,6 +104,30 @@ public class TestFSMainOperationsWebHdfs extends FSMainOperationsBaseTest {
return defaultWorkingDirectory;
}
/**
 * Creates three source files and one target file, concatenates the sources
 * onto the target, then checks that the sources are gone and that the
 * target has the combined length of all four files.
 */
@Test
public void testConcat() throws Exception {
  Path[] sources = {
      new Path("/test/hadoop/file1"),
      new Path("/test/hadoop/file2"),
      new Path("/test/hadoop/file3")};
  for (Path source : sources) {
    DFSTestUtil.createFile(fSys, source, 1024, (short) 3, 0);
  }

  Path target = new Path("/test/hadoop/catFile");
  DFSTestUtil.createFile(fSys, target, 1024, (short) 3, 0);
  Assert.assertTrue(exists(fSys, target));

  fSys.concat(target, sources);

  // the source files must no longer exist after the concat
  for (Path source : sources) {
    Assert.assertFalse(exists(fSys, source));
  }

  // target length = its own 1024 bytes plus 3 x 1024 from the sources
  FileStatus merged = fSys.getFileStatus(target);
  Assert.assertEquals(1024*4, merged.getLen());
}
@Override
@Test
public void testMkdirsFailsForSubdirectoryOfExistingFile() throws Exception {

View File

@ -217,6 +217,9 @@ Release 2.0.3-alpha - Unreleased
OPTIMIZATIONS
MAPREDUCE-4893. Fixed MR ApplicationMaster to do optimal assignment of
containers to get maximum locality. (Bikas Saha via vinodkv)
BUG FIXES
MAPREDUCE-4607. Race condition in ReduceTask completion can result in Task
@ -278,6 +281,9 @@ Release 2.0.3-alpha - Unreleased
MAPREDUCE-2264. Job status exceeds 100% in some cases.
(devaraj.k and sandyr via tucu)
MAPREDUCE-4969. TestKeyValueTextInputFormat test fails with Open JDK 7.
(Arpit Agarwal via suresh)
Release 2.0.2-alpha - 2012-09-07
INCOMPATIBLE CHANGES

View File

@ -747,7 +747,7 @@ public class RMContainerAllocator extends RMContainerRequestor
addContainerReq(req);
}
@SuppressWarnings("unchecked")
// this method will change the list of allocatedContainers.
private void assign(List<Container> allocatedContainers) {
Iterator<Container> it = allocatedContainers.iterator();
LOG.info("Got allocated containers " + allocatedContainers.size());
@ -788,84 +788,97 @@ public class RMContainerAllocator extends RMContainerRequestor
+ reduces.isEmpty());
isAssignable = false;
}
}
} else {
LOG.warn("Container allocated at unwanted priority: " + priority +
". Returning to RM...");
isAssignable = false;
}
boolean blackListed = false;
ContainerRequest assigned = null;
if(!isAssignable) {
// release container if we could not assign it
containerNotAssigned(allocated);
it.remove();
continue;
}
if (isAssignable) {
// do not assign if allocated container is on a
// blacklisted host
String allocatedHost = allocated.getNodeId().getHost();
blackListed = isNodeBlacklisted(allocatedHost);
if (blackListed) {
// we need to request for a new container
// and release the current one
LOG.info("Got allocated container on a blacklisted "
+ " host "+allocatedHost
+". Releasing container " + allocated);
// do not assign if allocated container is on a
// blacklisted host
String allocatedHost = allocated.getNodeId().getHost();
if (isNodeBlacklisted(allocatedHost)) {
// we need to request for a new container
// and release the current one
LOG.info("Got allocated container on a blacklisted "
+ " host "+allocatedHost
+". Releasing container " + allocated);
// find the request matching this allocated container
// and replace it with a new one
ContainerRequest toBeReplacedReq =
getContainerReqToReplace(allocated);
if (toBeReplacedReq != null) {
LOG.info("Placing a new container request for task attempt "
+ toBeReplacedReq.attemptID);
ContainerRequest newReq =
getFilteredContainerRequest(toBeReplacedReq);
decContainerReq(toBeReplacedReq);
if (toBeReplacedReq.attemptID.getTaskId().getTaskType() ==
TaskType.MAP) {
maps.put(newReq.attemptID, newReq);
}
else {
reduces.put(newReq.attemptID, newReq);
}
addContainerReq(newReq);
// find the request matching this allocated container
// and replace it with a new one
ContainerRequest toBeReplacedReq =
getContainerReqToReplace(allocated);
if (toBeReplacedReq != null) {
LOG.info("Placing a new container request for task attempt "
+ toBeReplacedReq.attemptID);
ContainerRequest newReq =
getFilteredContainerRequest(toBeReplacedReq);
decContainerReq(toBeReplacedReq);
if (toBeReplacedReq.attemptID.getTaskId().getTaskType() ==
TaskType.MAP) {
maps.put(newReq.attemptID, newReq);
}
else {
LOG.info("Could not map allocated container to a valid request."
+ " Releasing allocated container " + allocated);
reduces.put(newReq.attemptID, newReq);
}
addContainerReq(newReq);
}
else {
assigned = assign(allocated);
if (assigned != null) {
// Update resource requests
decContainerReq(assigned);
// send the container-assigned event to task attempt
eventHandler.handle(new TaskAttemptContainerAssignedEvent(
assigned.attemptID, allocated, applicationACLs));
assignedRequests.add(allocated, assigned.attemptID);
if (LOG.isDebugEnabled()) {
LOG.info("Assigned container (" + allocated + ") "
+ " to task " + assigned.attemptID + " on node "
+ allocated.getNodeId().toString());
}
}
else {
//not assigned to any request, release the container
LOG.info("Releasing unassigned and invalid container "
+ allocated + ". RM has gone crazy, someone go look!"
+ " Hey RM, if you are so rich, go donate to non-profits!");
}
LOG.info("Could not map allocated container to a valid request."
+ " Releasing allocated container " + allocated);
}
// release container if we could not assign it
containerNotAssigned(allocated);
it.remove();
continue;
}
// release container if it was blacklisted
// or if we could not assign it
if (blackListed || assigned == null) {
containersReleased++;
release(allocated.getId());
}
}
assignContainers(allocatedContainers);
// release container if we could not assign it
it = allocatedContainers.iterator();
while (it.hasNext()) {
Container allocated = it.next();
LOG.info("Releasing unassigned and invalid container "
+ allocated + ". RM may have assignment issues");
containerNotAssigned(allocated);
}
}
private ContainerRequest assign(Container allocated) {
/**
 * Records that an allocated container has been matched to a pending
 * request: decrements the outstanding resource request, notifies the task
 * attempt, and registers the container in {@code assignedRequests}.
 *
 * @param allocated the container being handed out
 * @param assigned the request the container was matched to
 */
@SuppressWarnings("unchecked")
private void containerAssigned(Container allocated,
    ContainerRequest assigned) {
  // Update resource requests
  decContainerReq(assigned);

  // send the container-assigned event to task attempt
  eventHandler.handle(new TaskAttemptContainerAssignedEvent(
      assigned.attemptID, allocated, applicationACLs));

  assignedRequests.add(allocated, assigned.attemptID);

  // The message is only built when debug logging is on, so log it at the
  // debug level to agree with the guard (it was previously emitted via
  // LOG.info inside the isDebugEnabled() check).
  if (LOG.isDebugEnabled()) {
    LOG.debug("Assigned container (" + allocated + ") "
        + " to task " + assigned.attemptID + " on node "
        + allocated.getNodeId().toString());
  }
}
/**
 * Handles a container that was not assigned to a request: bumps the
 * released-container counter and releases the container by its id.
 *
 * @param allocated the container to release
 */
private void containerNotAssigned(Container allocated) {
  containersReleased += 1;
  release(allocated.getId());
}
private ContainerRequest assignWithoutLocality(Container allocated) {
ContainerRequest assigned = null;
Priority priority = allocated.getPriority();
@ -877,18 +890,24 @@ public class RMContainerAllocator extends RMContainerRequestor
LOG.debug("Assigning container " + allocated + " to reduce");
}
assigned = assignToReduce(allocated);
} else if (PRIORITY_MAP.equals(priority)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Assigning container " + allocated + " to map");
}
assigned = assignToMap(allocated);
} else {
LOG.warn("Container allocated at unwanted priority: " + priority +
". Returning to RM...");
}
return assigned;
}
/**
 * Assigns the given containers to pending requests. Containers that
 * {@code assignWithoutLocality} can match are assigned and removed from the
 * list first; whatever remains is passed to {@code assignMapsWithLocality}.
 * The supplied list is modified in place.
 *
 * @param allocatedContainers containers to hand out; assigned ones are removed
 */
private void assignContainers(List<Container> allocatedContainers) {
  for (Iterator<Container> remaining = allocatedContainers.iterator();
      remaining.hasNext();) {
    Container candidate = remaining.next();
    ContainerRequest match = assignWithoutLocality(candidate);
    if (match == null) {
      // leave it in the list for the locality-aware pass below
      continue;
    }
    containerAssigned(candidate, match);
    remaining.remove();
  }

  assignMapsWithLocality(allocatedContainers);
}
private ContainerRequest getContainerReqToReplace(Container allocated) {
LOG.info("Finding containerReq for allocated container: " + allocated);
@ -959,11 +978,15 @@ public class RMContainerAllocator extends RMContainerRequestor
}
@SuppressWarnings("unchecked")
private ContainerRequest assignToMap(Container allocated) {
//try to assign to maps if present
//first by host, then by rack, followed by *
ContainerRequest assigned = null;
while (assigned == null && maps.size() > 0) {
private void assignMapsWithLocality(List<Container> allocatedContainers) {
// try to assign to all nodes first to match node local
Iterator<Container> it = allocatedContainers.iterator();
while(it.hasNext() && maps.size() > 0){
Container allocated = it.next();
Priority priority = allocated.getPriority();
assert PRIORITY_MAP.equals(priority);
// "if (maps.containsKey(tId))" below should be almost always true.
// hence this while loop would almost always have O(1) complexity
String host = allocated.getNodeId().getHost();
LinkedList<TaskAttemptId> list = mapsHostMapping.get(host);
while (list != null && list.size() > 0) {
@ -972,7 +995,9 @@ public class RMContainerAllocator extends RMContainerRequestor
}
TaskAttemptId tId = list.removeFirst();
if (maps.containsKey(tId)) {
assigned = maps.remove(tId);
ContainerRequest assigned = maps.remove(tId);
containerAssigned(allocated, assigned);
it.remove();
JobCounterUpdateEvent jce =
new JobCounterUpdateEvent(assigned.attemptID.getTaskId().getJobId());
jce.addCounterUpdate(JobCounter.DATA_LOCAL_MAPS, 1);
@ -984,39 +1009,56 @@ public class RMContainerAllocator extends RMContainerRequestor
break;
}
}
if (assigned == null) {
String rack = RackResolver.resolve(host).getNetworkLocation();
list = mapsRackMapping.get(rack);
while (list != null && list.size() > 0) {
TaskAttemptId tId = list.removeFirst();
if (maps.containsKey(tId)) {
assigned = maps.remove(tId);
JobCounterUpdateEvent jce =
new JobCounterUpdateEvent(assigned.attemptID.getTaskId().getJobId());
jce.addCounterUpdate(JobCounter.RACK_LOCAL_MAPS, 1);
eventHandler.handle(jce);
rackLocalAssigned++;
if (LOG.isDebugEnabled()) {
LOG.debug("Assigned based on rack match " + rack);
}
break;
}
}
if (assigned == null && maps.size() > 0) {
TaskAttemptId tId = maps.keySet().iterator().next();
assigned = maps.remove(tId);
}
// try to match all rack local
it = allocatedContainers.iterator();
while(it.hasNext() && maps.size() > 0){
Container allocated = it.next();
Priority priority = allocated.getPriority();
assert PRIORITY_MAP.equals(priority);
// "if (maps.containsKey(tId))" below should be almost always true.
// hence this while loop would almost always have O(1) complexity
String host = allocated.getNodeId().getHost();
String rack = RackResolver.resolve(host).getNetworkLocation();
LinkedList<TaskAttemptId> list = mapsRackMapping.get(rack);
while (list != null && list.size() > 0) {
TaskAttemptId tId = list.removeFirst();
if (maps.containsKey(tId)) {
ContainerRequest assigned = maps.remove(tId);
containerAssigned(allocated, assigned);
it.remove();
JobCounterUpdateEvent jce =
new JobCounterUpdateEvent(assigned.attemptID.getTaskId().getJobId());
jce.addCounterUpdate(JobCounter.OTHER_LOCAL_MAPS, 1);
jce.addCounterUpdate(JobCounter.RACK_LOCAL_MAPS, 1);
eventHandler.handle(jce);
rackLocalAssigned++;
if (LOG.isDebugEnabled()) {
LOG.debug("Assigned based on * match");
LOG.debug("Assigned based on rack match " + rack);
}
break;
}
}
}
return assigned;
// assign remaining
it = allocatedContainers.iterator();
while(it.hasNext() && maps.size() > 0){
Container allocated = it.next();
Priority priority = allocated.getPriority();
assert PRIORITY_MAP.equals(priority);
TaskAttemptId tId = maps.keySet().iterator().next();
ContainerRequest assigned = maps.remove(tId);
containerAssigned(allocated, assigned);
it.remove();
JobCounterUpdateEvent jce =
new JobCounterUpdateEvent(assigned.attemptID.getTaskId().getJobId());
jce.addCounterUpdate(JobCounter.OTHER_LOCAL_MAPS, 1);
eventHandler.handle(jce);
if (LOG.isDebugEnabled()) {
LOG.debug("Assigned based on * match");
}
}
}
}

View File

@ -190,6 +190,92 @@ public class TestRMContainerAllocator {
checkAssignments(new ContainerRequestEvent[] { event1, event2, event3 },
assigned, false);
}
@Test
public void testMapNodeLocality() throws Exception {
  // test checks that ordering of allocated containers list from the RM does
  // not affect the map->container assignment done by the AM. If there is a
  // node local container available for a map then it should be assigned to
  // that container and not a rack-local container that happened to be seen
  // earlier in the allocated containers list from the RM.
  // Regression test for MAPREDUCE-4893
  LOG.info("Running testMapNodeLocality");

  Configuration conf = new Configuration();
  MyResourceManager rm = new MyResourceManager(conf);
  rm.start();
  DrainDispatcher dispatcher = (DrainDispatcher) rm.getRMContext()
      .getDispatcher();

  // Submit the application
  RMApp app = rm.submitApp(1024);
  dispatcher.await();

  MockNM amNodeManager = rm.registerNode("amNM:1234", 2048);
  amNodeManager.nodeHeartbeat(true);
  dispatcher.await();

  ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt()
      .getAppAttemptId();
  rm.sendAMLaunched(appAttemptId);
  dispatcher.await();

  JobId jobId = MRBuilderUtils.newJobId(appAttemptId.getApplicationId(), 0);
  Job mockJob = mock(Job.class);
  when(mockJob.getReport()).thenReturn(
      MRBuilderUtils.newJobReport(jobId, "job", "user", JobState.RUNNING, 0,
          0, 0, 0, 0, 0, 0, "jobfile", null, false, ""));
  MyContainerAllocator allocator = new MyContainerAllocator(rm, conf,
      appAttemptId, mockJob);

  // add resources to scheduler
  MockNM nodeManager1 = rm.registerNode("h1:1234", 3072); // can assign 2 maps
  rm.registerNode("h2:1234", 10240); // h2 never heartbeats in this test
  MockNM nodeManager3 = rm.registerNode("h3:1234", 1536); // assign 1 map
  dispatcher.await();

  // create the container requests for maps
  ContainerRequestEvent event1 = createReq(jobId, 1, 1024,
      new String[] { "h1" });
  allocator.sendRequest(event1);
  ContainerRequestEvent event2 = createReq(jobId, 2, 1024,
      new String[] { "h1" });
  allocator.sendRequest(event2);
  ContainerRequestEvent event3 = createReq(jobId, 3, 1024,
      new String[] { "h2" });
  allocator.sendRequest(event3);

  // this tells the scheduler about the requests
  // no registered node has sent a heartbeat yet, so no allocations
  List<TaskAttemptContainerAssignedEvent> assigned = allocator.schedule();
  dispatcher.await();
  Assert.assertEquals("No of assignments must be 0", 0, assigned.size());

  // update resources in scheduler
  // Node heartbeat from rack-local first. This makes node h3 the first in the
  // list of allocated containers but it should not be assigned to task1.
  nodeManager3.nodeHeartbeat(true);
  // Node heartbeat from node-local next. This allocates 2 node local
  // containers for task1 and task2. These should be matched with those tasks.
  nodeManager1.nodeHeartbeat(true);
  dispatcher.await();

  assigned = allocator.schedule();
  dispatcher.await();
  checkAssignments(new ContainerRequestEvent[] { event1, event2, event3 },
      assigned, false);

  // remove the rack-local assignment that should have happened for task3.
  // Index-based removal avoids mutating the list from inside a for-each
  // loop over it, and assertEquals reports the actual host on failure.
  for (int i = 0; i < assigned.size(); i++) {
    TaskAttemptContainerAssignedEvent event = assigned.get(i);
    if (event.getTaskAttemptID().equals(event3.getAttemptID())) {
      assigned.remove(i);
      Assert.assertEquals("h3",
          event.getContainer().getNodeId().getHost());
      break;
    }
  }

  // the two remaining assignments must be on the hosts that were requested
  checkAssignments(new ContainerRequestEvent[] { event1, event2},
      assigned, true);
}
@Test
public void testResource() throws Exception {
@ -1202,7 +1288,7 @@ public class TestRMContainerAllocator {
if (checkHostMatch) {
Assert.assertTrue("Not assigned to requested host", Arrays.asList(
request.getHosts()).contains(
assigned.getContainer().getNodeId().toString()));
assigned.getContainer().getNodeId().getHost()));
}
}

View File

@ -136,32 +136,47 @@ public class TestKeyValueTextInputFormat extends TestCase {
}
public void testUTF8() throws Exception {
LineReader in = makeStream("abcd\u20acbdcd\u20ac");
Text line = new Text();
in.readLine(line);
assertEquals("readLine changed utf8 characters",
"abcd\u20acbdcd\u20ac", line.toString());
in = makeStream("abc\u200axyz");
in.readLine(line);
assertEquals("split on fake newline", "abc\u200axyz", line.toString());
LineReader in = null;
try {
in = makeStream("abcd\u20acbdcd\u20ac");
Text line = new Text();
in.readLine(line);
assertEquals("readLine changed utf8 characters",
"abcd\u20acbdcd\u20ac", line.toString());
in = makeStream("abc\u200axyz");
in.readLine(line);
assertEquals("split on fake newline", "abc\u200axyz", line.toString());
} finally {
if (in != null) {
in.close();
}
}
}
public void testNewLines() throws Exception {
LineReader in = makeStream("a\nbb\n\nccc\rdddd\r\neeeee");
Text out = new Text();
in.readLine(out);
assertEquals("line1 length", 1, out.getLength());
in.readLine(out);
assertEquals("line2 length", 2, out.getLength());
in.readLine(out);
assertEquals("line3 length", 0, out.getLength());
in.readLine(out);
assertEquals("line4 length", 3, out.getLength());
in.readLine(out);
assertEquals("line5 length", 4, out.getLength());
in.readLine(out);
assertEquals("line5 length", 5, out.getLength());
assertEquals("end of file", 0, in.readLine(out));
LineReader in = null;
try {
in = makeStream("a\nbb\n\nccc\rdddd\r\neeeee");
Text out = new Text();
in.readLine(out);
assertEquals("line1 length", 1, out.getLength());
in.readLine(out);
assertEquals("line2 length", 2, out.getLength());
in.readLine(out);
assertEquals("line3 length", 0, out.getLength());
in.readLine(out);
assertEquals("line4 length", 3, out.getLength());
in.readLine(out);
assertEquals("line5 length", 4, out.getLength());
in.readLine(out);
assertEquals("line5 length", 5, out.getLength());
assertEquals("end of file", 0, in.readLine(out));
} finally {
if (in != null) {
in.close();
}
}
}
private static void writeFile(FileSystem fs, Path name,
@ -183,14 +198,21 @@ public class TestKeyValueTextInputFormat extends TestCase {
InputSplit split,
JobConf job) throws IOException {
List<Text> result = new ArrayList<Text>();
RecordReader<Text, Text> reader = format.getRecordReader(split, job,
voidReporter);
Text key = reader.createKey();
Text value = reader.createValue();
while (reader.next(key, value)) {
result.add(value);
value = reader.createValue();
}
RecordReader<Text, Text> reader = null;
try {
reader = format.getRecordReader(split, job, voidReporter);
Text key = reader.createKey();
Text value = reader.createValue();
while (reader.next(key, value)) {
result.add(value);
value = (Text) reader.createValue();
}
} finally {
if (reader != null) {
reader.close();
}
}
return result;
}

View File

@ -281,6 +281,9 @@ Release 0.23.7 - UNRELEASED
BUG FIXES
YARN-343. Capacity Scheduler maximum-capacity value -1 is invalid (Xuan
Gong via tgraves)
Release 0.23.6 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -99,6 +99,9 @@ public class CapacitySchedulerConfiguration extends Configuration {
@Private
public static final float MAXIMUM_CAPACITY_VALUE = 100;
@Private
public static final float DEFAULT_MAXIMUM_CAPACITY_VALUE = -1.0f;
@Private
public static final int DEFAULT_USER_LIMIT = 100;
@ -206,6 +209,8 @@ public class CapacitySchedulerConfiguration extends Configuration {
/**
 * Returns the configured maximum capacity for a queue.
 *
 * A stored value equal to {@code DEFAULT_MAXIMUM_CAPACITY_VALUE} (-1) is
 * treated the same as an absent value, so {@code MAXIMUM_CAPACITY_VALUE}
 * is returned in both cases.
 *
 * @param queue the queue whose maximum capacity is looked up
 * @return the configured maximum capacity, or {@code MAXIMUM_CAPACITY_VALUE}
 *         when it is unset or set to -1
 */
public float getMaximumCapacity(String queue) {
  final float configured = getFloat(
      getQueuePrefix(queue) + MAXIMUM_CAPACITY, MAXIMUM_CAPACITY_VALUE);
  if (configured == DEFAULT_MAXIMUM_CAPACITY_VALUE) {
    return MAXIMUM_CAPACITY_VALUE;
  }
  return configured;
}

View File

@ -243,6 +243,18 @@ public class TestCapacityScheduler {
LOG.info("Setup top-level queues a and b");
}
/**
 * Checks that the maximum capacity of queue A is MAXIMUM_CAPACITY_VALUE
 * when nothing is set, reflects an explicitly set value, and is
 * MAXIMUM_CAPACITY_VALUE again when set to -1.
 */
@Test
public void testMaximumCapacitySetup() {
  final float tolerance = 0.0000001f;
  final float fullCapacity =
      CapacitySchedulerConfiguration.MAXIMUM_CAPACITY_VALUE;
  CapacitySchedulerConfiguration csConf =
      new CapacitySchedulerConfiguration();

  // nothing configured yet
  assertEquals(fullCapacity, csConf.getMaximumCapacity(A), tolerance);

  // explicit value is returned as-is
  csConf.setMaximumCapacity(A, 50.0f);
  assertEquals(50.0f, csConf.getMaximumCapacity(A), tolerance);

  // -1 is mapped back to the maximum
  csConf.setMaximumCapacity(A, -1);
  assertEquals(fullCapacity, csConf.getMaximumCapacity(A), tolerance);
}
@Test
public void testRefreshQueues() throws Exception {
CapacityScheduler cs = new CapacityScheduler();