diff --git a/lib/commons-cli-2.0-SNAPSHOT.jar b/lib/commons-cli-2.0-SNAPSHOT.jar new file mode 100644 index 00000000000..0b1d51072a7 Binary files /dev/null and b/lib/commons-cli-2.0-SNAPSHOT.jar differ diff --git a/lib/hsqldb-1.8.0.10.LICENSE.txt b/lib/hsqldb-1.8.0.10.LICENSE.txt new file mode 100644 index 00000000000..d45b9f8cc07 --- /dev/null +++ b/lib/hsqldb-1.8.0.10.LICENSE.txt @@ -0,0 +1,66 @@ +/* Copyright (c) 1995-2000, The Hypersonic SQL Group. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of the Hypersonic SQL Group nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE HYPERSONIC SQL GROUP, + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This software consists of voluntary contributions made by many individuals + * on behalf of the Hypersonic SQL Group. + * + * + * For work added by the HSQL Development Group: + * + * Copyright (c) 2001-2004, The HSQL Development Group + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of the HSQL Development Group nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL HSQL DEVELOPMENT GROUP, HSQLDB.ORG, + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + diff --git a/lib/hsqldb-1.8.0.10.jar b/lib/hsqldb-1.8.0.10.jar new file mode 100644 index 00000000000..e010269ddf6 Binary files /dev/null and b/lib/hsqldb-1.8.0.10.jar differ diff --git a/lib/jdiff/hadoop_0.17.0.xml b/lib/jdiff/hadoop_0.17.0.xml new file mode 100644 index 00000000000..69dded31403 --- /dev/null +++ b/lib/jdiff/hadoop_0.17.0.xml @@ -0,0 +1,43272 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + final. + + @param name resource to be added, the classpath is examined for a file + with that name.]]> + + + + + + final. + + @param url url of the resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param file file-path of resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + name property, null if + no such property exists. + + Values are processed for variable expansion + before being returned. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + name property, without doing + variable expansion. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + + value of the name property. + + @param name property name. + @param value property value.]]> + + + + + + + name property. If no such property + exists, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value, or defaultValue if the property + doesn't exist.]]> + + + + + + + name property as an int. + + If no such property exists, or if the specified value is not a valid + int, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as an int, + or defaultValue.]]> + + + + + + + name property to an int. + + @param name property name. + @param value int value of the property.]]> + + + + + + + name property as a long. + If no such property is specified, or if the specified value is not a valid + long, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a long, + or defaultValue.]]> + + + + + + + name property to a long. + + @param name property name. + @param value long value of the property.]]> + + + + + + + name property as a float. + If no such property is specified, or if the specified value is not a valid + float, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a float, + or defaultValue.]]> + + + + + + + name property as a boolean. + If no such property is specified, or if the specified value is not a valid + boolean, then defaultValue is returned. + + @param name property name. 
+ @param defaultValue default value. + @return property value as a boolean, + or defaultValue.]]> + + + + + + + name property to a boolean. + + @param name property name. + @param value boolean value of the property.]]> + + + + + + + + + + + + + name property as + an array of Strings. + If no such property is specified then null is returned. + + @param name property name. + @return property value as an array of Strings, + or null.]]> + + + + + + + name property as + an array of Strings. + If no such property is specified then default value is returned. + + @param name property name. + @param defaultValue The default value + @return property value as an array of Strings, + or default value.]]> + + + + + + + name property as + as comma delimited values. + + @param name property name. + @param values The values]]> + + + + + + + + + + + + + + name property as a Class. + If no such property is specified, then defaultValue is + returned. + + @param name the class name. + @param defaultValue default value. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property as a Class + implementing the interface specified by xface. + + If no such property is specified, then defaultValue is + returned. + + An exception is thrown if the returned class does not implement the named + interface. + + @param name the class name. + @param defaultValue default value. + @param xface the interface implemented by the named class. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property to the name of a + theClass implementing the given interface xface. + + An exception is thrown if theClass does not implement the + interface xface. + + @param name property name. + @param theClass property value. + @param xface the interface implemented by the named class.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + + + + + name. + + @param name configuration resource name. + @return an input stream attached to the resource.]]> + + + + + + name. + + @param name configuration resource name. + @return a reader attached to the resource.]]> + + + + + String + key-value pairs in the configuration. + + @return an iterator over the entries.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + true to set quiet-mode on, false + to turn it off.]]> + + + + + + + + + + + Resources + +

Configurations are specified by resources. A resource contains a set of + name/value pairs as XML data. Each resource is named by either a + String or by a {@link Path}. If named by a String, + then the classpath is examined for a file with that name. If named by a + Path, then the local filesystem is examined directly, without + referring to the classpath. + +

Hadoop by default specifies two resources, loaded in-order from the + classpath:

    +
  1. hadoop-default.xml + : Read-only defaults for hadoop.
  2. +
  3. hadoop-site.xml: Site-specific configuration for a given hadoop + installation.
  4. +
+ Applications may add additional resources, which are loaded + subsequent to these resources in the order they are added. + +

Final Parameters

+ +

Configuration parameters may be declared final. + Once a resource declares a value final, no subsequently-loaded + resource can alter that value. + For example, one might define a final parameter with: +

+  <property>
+    <name>dfs.client.buffer.dir</name>
+    <value>/tmp/hadoop/dfs/client</value>
+    <final>true</final>
+  </property>
+ + Administrators typically define parameters as final in + hadoop-site.xml for values that user applications may not alter. + +

Variable Expansion

+ +

Value strings are first processed for variable expansion. The + available properties are:

    +
  1. Other properties defined in this Configuration; and, if a name is + undefined here,
  2. +
  3. Properties in {@link System#getProperties()}.
  4. +
+ +

For example, if a configuration resource contains the following property + definitions: +

+  <property>
+    <name>basedir</name>
+    <value>/user/${user.name}</value>
+  </property>
+  
+  <property>
+    <name>tempdir</name>
+    <value>${basedir}/tmp</value>
+  </property>
+ + When conf.get("tempdir") is called, then ${basedir} + will be resolved to another property in this Configuration, while + ${user.name} would then ordinarily be resolved to the value + of the System property with that name.]]> +
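A minimal usage sketch of the expansion just described (not part of the original javadoc; it assumes the standard org.apache.hadoop.conf.Configuration class and a surrounding main method):

  Configuration conf = new Configuration();
  // the same two properties as in the resource snippet above
  conf.set("basedir", "/user/${user.name}");
  conf.set("tempdir", "${basedir}/tmp");
  // get() expands ${basedir} from this Configuration and ${user.name}
  // from the Java System properties
  String tempdir = conf.get("tempdir");   // e.g. /user/alice/tmp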
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The balancer is a tool that balances disk space usage on an HDFS cluster + when some datanodes become full or when new empty nodes join the cluster. + The tool is deployed as an application program that can be run by the + cluster administrator on a live HDFS cluster while applications + adding and deleting files. + +

SYNOPSIS +

+ To start:
+      bin/start-balancer.sh [-threshold <threshold>]
+      Example: bin/start-balancer.sh
+                     start the balancer with a default threshold of 10%
+               bin/start-balancer.sh -threshold 5
+                     start the balancer with a threshold of 5%
+ To stop:
+      bin/stop-balancer.sh
+ 
+ +

DESCRIPTION +

The threshold parameter is a fraction in the range of (0%, 100%) with a + default value of 10%. The threshold sets a target for whether the cluster + is balanced. A cluster is balanced if, for each datanode, the utilization + of the node (ratio of used space at the node to total capacity of the node) + differs from the utilization of the cluster (ratio of used space in the cluster + to total capacity of the cluster) by no more than the threshold value. + The smaller the threshold, the more balanced a cluster will become. + It takes more time to run the balancer for small threshold values. + Also, for a very small threshold the cluster may not be able to reach the + balanced state when applications write and delete files concurrently. + +

The tool moves blocks from highly utilized datanodes to poorly + utilized datanodes iteratively. In each iteration a datanode moves or + receives no more than the lesser of 10G bytes or the threshold fraction + of its capacity. Each iteration runs no more than 20 minutes. + At the end of each iteration, the balancer obtains updated datanodes + information from the namenode. + +

A system property that limits the balancer's use of bandwidth is + defined in the default configuration file: +

+ 
+   <name>dfs.balance.bandwidthPerSec</name>
+   <value>1048576</value>
+   <description>Specifies the maximum bandwidth that each datanode
+ can utilize for the balancing purpose in terms of the number of bytes
+ per second.</description>
+ 
+ 
+ +

This property determines the maximum speed at which a block will be + moved from one datanode to another. The default value is 1MB/s. The higher + the bandwidth, the faster a cluster can reach the balanced state, + but with greater competition with application processes. If an + administrator changes the value of this property in the configuration + file, the change is observed when HDFS is next restarted. + +

MONITORING BALANCER PROGRESS +

After the balancer is started, an output file name where the balancer + progress will be recorded is printed on the screen. The administrator + can monitor the running of the balancer by reading the output file. + The output shows the balancer's status iteration by iteration. In each + iteration it prints the starting time, the iteration number, the total + number of bytes that have been moved in the previous iterations, + the total number of bytes that are left to move in order for the cluster + to be balanced, and the number of bytes that are being moved in this + iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left + To Move" is decreasing. + +

Running multiple instances of the balancer in an HDFS cluster is + prohibited by the tool. + +

The balancer automatically exits when any of the following five + conditions is satisfied: +

    +
  1. The cluster is balanced; +
  2. No block can be moved; +
  3. No block has been moved for five consecutive iterations; +
  4. An IOException occurs while communicating with the namenode; +
  5. Another balancer is running. +
+ +

Upon exit, a balancer returns an exit code and prints one of the + following messages to the output file, corresponding to the above exit + reasons: +

    +
  1. The cluster is balanced. Exiting +
  2. No block can be moved. Exiting... +
  3. No block has been moved for 3 iterations. Exiting... +
  4. Received an IO exception: failure reason. Exiting... +
  5. Another balancer is running. Exiting... +
+ +

The administrator can interrupt the execution of the balancer at any + time by running the command "stop-balancer.sh" on the machine where the + balancer is running.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + in]]> + + + + + + + out.]]> + + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + stream of bytes (of BLOCK_SIZE or less) + + This info is stored on a local disk. The DataNode + reports the table's contents to the NameNode upon startup + and every so often afterwards. + + DataNodes spend their lives in an endless loop of asking + the NameNode for something to do. A NameNode cannot connect + to a DataNode directly; a NameNode simply returns values from + functions invoked by a DataNode. + + DataNodes maintain an open server socket so that client code + or other DataNodes can read/write data. The host/port for + this server is reported to the NameNode, which then sends that + information to clients or other DataNodes that might be interested.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The tool scans all files and directories, starting from an indicated + root path. The following abnormal conditions are detected and handled:

+
    +
  • files with blocks that are completely missing from all datanodes.
    + In this case the tool can perform one of the following actions: +
      +
    • none ({@link NamenodeFsck#FIXING_NONE})
    • +
    • move corrupted files to the /lost+found directory on DFS + ({@link NamenodeFsck#FIXING_MOVE}). Remaining data blocks are saved as + block chains, representing the longest consecutive series of valid blocks.
    • +
    • delete corrupted files ({@link NamenodeFsck#FIXING_DELETE})
    • +
    +
  • +
  • detect files with under-replicated or over-replicated blocks
  • +
+ Additionally, the tool collects detailed overall DFS statistics, and + optionally can print detailed statistics on block locations and replication + factors of each file.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + :/data[/] HTTP/1.1 + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + :/listPaths[/][[&option]*] HTTP/1.1 + } + + Where option (default) in: + recursive ("no") + filter (".*") + exclude ("\..*\.crc") + + Response: A flat list of files/directories in the following format: + {@code + + + + + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The name-node can be started with one of the following startup options: +
    +
  • {@link FSConstants.StartupOption#REGULAR REGULAR} - normal startup
  • +
  • {@link FSConstants.StartupOption#FORMAT FORMAT} - format name node
  • +
  • {@link FSConstants.StartupOption#UPGRADE UPGRADE} - start the cluster + upgrade and create a snapshot of the current file system state
  • +
  • {@link FSConstants.StartupOption#ROLLBACK ROLLBACK} - roll the + cluster back to the previous state
  • +
+ The option is passed via configuration field: + dfs.namenode.startup + + The conf will be modified to reflect the actual ports on which + the NameNode is up and running if the user passes the port as + zero in the conf. + + @param conf confirguration + @throws IOException]]> +
+
+ + + + zero.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + datanode whose + total size is size + + @param datanode on which blocks are located + @param size total size of blocks]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + blocksequence (namespace) + 2) block->machinelist ("inodes") + + The first table is stored on disk and is very precious. + The second table is rebuilt every time the NameNode comes + up. + + 'NameNode' refers to both this class as well as the 'NameNode server'. + The 'FSNamesystem' class actually performs most of the filesystem + management. The majority of the 'NameNode' class itself is concerned + with exposing the IPC interface to the outside world, plus some + configuration management. + + NameNode implements the ClientProtocol interface, which allows + clients to ask for DFS services. ClientProtocol is not + designed for direct use by authors of DFS client code. End-users + should instead use the org.apache.nutch.hadoop.fs.FileSystem class. + + NameNode also implements the DatanodeProtocol interface, used by + DataNode programs that actually store DFS data blocks. These + methods are invoked repeatedly and automatically by all the + DataNodes in a DFS deployment. + + NameNode also implements the NamenodeProtocol interface, used by + secondary namenodes or rebalancing processes to get partial namenode's + state, for example partial blocksMap etc.]]> + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The tool scans all files and directories, starting from an indicated + root path. The following abnormal conditions are detected and handled:

+
    +
  • files with blocks that are completely missing from all datanodes.
    + In this case the tool can perform one of the following actions: +
      +
    • none ({@link #FIXING_NONE})
    • +
    • move corrupted files to the /lost+found directory on DFS + ({@link #FIXING_MOVE}). Remaining data blocks are saved as + block chains, representing the longest consecutive series of valid blocks.
    • +
    • delete corrupted files ({@link #FIXING_DELETE})
    • +
    +
  • +
  • detect files with under-replicated or over-replicated blocks
  • +
+ Additionally, the tool collects detailed overall DFS statistics, and + optionally can print detailed statistics on block locations and replication + factors of each file.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #syncs}.inc()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A distributed implementation of {@link +org.apache.hadoop.fs.FileSystem}. This is loosely modelled after +Google's GFS.

+ +

The most important difference is that unlike GFS, Hadoop DFS files +have strictly one writer at any one time. Bytes are always appended +to the end of the writer's stream. There is no notion of "record appends" +or "mutations" that are then checked or reordered. Writers simply emit +a byte stream. That byte stream is guaranteed to be stored in the +order written.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #blocksRead}.inc()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        dfs.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        dfs.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically. +

+ Name Node Status info is reported in another MBean + @see org.apache.hadoop.dfs.datanode.metrics.FSDatasetMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Data Node runtime statistic info is report in another MBean + @see org.apache.hadoop.dfs.datanode.metrics.DataNodeStatisticsMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Name Node runtime statistic info is report in another MBean + @see org.apache.hadoop.dfs.namenode.metrics.NameNodeStatisticsMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        dfs.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        dfs.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically. +

+ Name Node Status info is report in another MBean + @see org.apache.hadoop.dfs.namenode.metrics.FSNamesystemMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DistributedCache is a facility provided by the Map-Reduce + framework to cache files (text, archives, jars etc.) needed by applications. +

+ +

Applications specify the files, via urls (hdfs:// or http://) to be cached + via the {@link JobConf}. The DistributedCache assumes that the + files specified via hdfs:// urls are already present on the + {@link FileSystem} at the path specified by the url.

+ +

The framework will copy the necessary files on to the slave node before + any tasks for the job are executed on that node. Its efficiency stems from + the fact that the files are only copied once per job and the ability to + cache archives which are un-archived on the slaves.

+ +

DistributedCache can be used to distribute simple, read-only + data/text files and/or more complex types such as archives, jars etc. + Archives (zip files) are un-archived at the slave nodes. Jars may be + optionally added to the classpath of the tasks, a rudimentary software + distribution mechanism. Files have execution permissions. Optionally users + can also direct it to symlink the distributed cache file(s) into + the working directory of the task.

+ +

DistributedCache tracks modification timestamps of the cache + files. Clearly the cache files should not be modified by the application + or externally while the job is executing.

+ +

Here is an illustrative example on how to use the + DistributedCache:

+

+     // Setting up the cache for the application
+     
+     1. Copy the requisite files to the FileSystem:
+     
+     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
+     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
+     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+     
+     2. Setup the application's JobConf:
+     
+     JobConf job = new JobConf();
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
+                                   job);
+     DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
+     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+
+     3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+     
+     public static class MapClass extends MapReduceBase  
+     implements Mapper<K, V, K, V> {
+     
+       private Path[] localArchives;
+       private Path[] localFiles;
+       
+       public void configure(JobConf job) {
+         // Get the cached archives/files
+         localArchives = DistributedCache.getLocalCacheArchives(job);
+         localFiles = DistributedCache.getLocalCacheFiles(job);
+       }
+       
+       public void map(K key, V value, 
+                       OutputCollector<K, V> output, Reporter reporter) 
+       throws IOException {
+         // Use data from the cached archives/files here
+         // ...
+         // ...
+         output.collect(k, v);
+       }
+     }
+     
+ 

+ + @see JobConf + @see JobClient]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BufferedFSInputStream + with the specified buffer size, + and saves its argument, the input stream + in, for later use. An internal + buffer array of length size + is created and stored in buf. + + @param in the underlying input stream. + @param size the buffer size. + @exception IllegalArgumentException if size <= 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + setReplication of FileSystem + @param src file name + @param replication new replication + @throws IOException + @return true if successful; + false if file does not exist or is a directory]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ']]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fs.scheme.class whose value names the FileSystem class. + The entire URI is passed to the FileSystem instance's initialize method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f is a file, return the size of the file; + If f is a directory, return the size of the directory tree + @deprecated Use {@link #getContentSummary(Path)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Return all the files that match filePattern and are not checksum + files. Results are sorted by their names. + +

+ A filename pattern is composed of regular characters and + special pattern matching characters, which are: + +

+
+
+

+

? +
Matches any single character. + +

+

* +
Matches zero or more characters. + +

+

[abc] +
Matches a single character from character set + {a,b,c}. + +

+

[a-b] +
Matches a single character from the character range + {a...b}. Note that character a must be + lexicographically less than or equal to character b. + +

+

[^a] +
Matches a single character that is not from character set or range + {a}. Note that the ^ character must occur + immediately to the right of the opening bracket. + +

+

\c +
Removes (escapes) any special meaning of character c. + +

+

{ab,cd} +
Matches a string from the string set {ab, cd} + +

+

{ab,c{de,fh}} +
Matches a string from the string set {ab, cde, cfh} + +
+
+
+ + @param pathPattern a regular expression specifying a path pattern + + @return an array of paths that match the path pattern + @throws IOException]]> +
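A small sketch of the pattern matching described above, assuming the glob method documented here (a Path pattern in, an array of matching Paths out); the /logs directory layout and the pattern are made-up examples:

  FileSystem fs = FileSystem.get(new Configuration());
  // all part files under any 2008 log directory, e.g. /logs/2008-05-01/part-00000
  Path[] matches = fs.globPaths(new Path("/logs/2008-*/part-[0-9]*"));
  for (Path p : matches) {
    System.out.println(p);
  }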
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All user code that may potentially use the Hadoop Distributed + File System should be written to use a FileSystem object. The + Hadoop DFS is a multi-machine system that appears as a single + disk. It's useful because of its fault tolerance and potentially + very large capacity. + +

+ The local implementation is {@link LocalFileSystem} and distributed + implementation is {@link DistributedFileSystem}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FilterFileSystem contains + some other file system, which it uses as + its basic file system, possibly transforming + the data along the way or providing additional + functionality. The class FilterFileSystem + itself simply overrides all methods of + FileSystem with versions that + pass all requests to the contained file + system. Subclasses of FilterFileSystem + may further override some of these methods + and may also provide additional methods + and fields.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + buf at offset + and checksum into checksum. + The method is used for implementing read, therefore, it should be optimized + for sequential reading + @param pos chunkPos + @param buf desitination buffer + @param offset offset in buf at which to store data + @param len maximun number of bytes to read + @return number of bytes read]]> + + + + + + + + + + + + + + + + + -1 if the end of the + stream is reached. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + This method implements the general contract of the corresponding + {@link InputStream#read(byte[], int, int) read} method of + the {@link InputStream} class. As an additional + convenience, it attempts to read as many bytes as possible by repeatedly + invoking the read method of the underlying stream. This + iterated read continues until one of the following + conditions becomes true:

    + +
  • The specified number of bytes have been read, + +
  • The read method of the underlying stream returns + -1, indicating end-of-file. + +
If the first read on the underlying stream returns + -1 to indicate end-of-file then this method returns + -1. Otherwise this method returns the number of bytes + actually read. + + @param b destination buffer. + @param off offset at which to start storing bytes. + @param len maximum number of bytes to read. + @return the number of bytes read, or -1 if the end of + the stream has been reached. + @exception IOException if an I/O error occurs. + ChecksumException if any checksum error occurs]]> +
+ + + + + + + + + + + + n bytes of data from the + input stream. + +

This method may skip more bytes than are remaining in the backing + file. This produces no exception and the number of bytes skipped + may include some number of bytes that were beyond the EOF of the + backing file. Attempting to read from the stream after skipping past + the end will result in -1 indicating the end of the file. + +

If n is negative, no bytes are skipped. + + @param n the number of bytes to be skipped. + @return the actual number of bytes skipped. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to skip to is corrupted]]> + + + + + + + This method may seek past the end of the file. + This produces no exception and an attempt to read from + the stream will result in -1 indicating the end of the file. + + @param pos the postion to seek to. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to seek to is corrupted]]> + + + + + + + + + + len bytes from + stm + + @param stm an input stream + @param buf destiniation buffer + @param offset offset at which to store data + @param len number of bytes to read + @return actual number of bytes read + @throws IOException if there is any IO error]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + len bytes from the specified byte array + starting at offset off and generate a checksum for + each data chunk. + +

This method stores bytes from the given array into this + stream's buffer before it gets checksumed. The buffer gets checksumed + and flushed to the underlying output stream when all data + in a checksum chunk are in the buffer. If the buffer is empty and + requested length is at least as large as the size of next checksum chunk + size, this method will checksum and write the chunk directly + to the underlying output stream. Thus it avoids uneccessary data copy. + + @param b the data. + @param off the start offset in the data. + @param len the number of bytes to write. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if and only if pathname + should be included]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + trash feature. Files are moved to a user's trash + directory, a subdirectory of their home directory named ".Trash". Files are + initially moved to a current sub-directory of the trash directory. + Within that sub-directory their original path is preserved. Periodically + one may checkpoint the current trash and remove older checkpoints. (This + design permits trash management without enumeration of the full trash + content, without date support in the filesystem, and without clock + synchronization.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A client for the Kosmos filesystem (KFS) + +

Introduction

+ +This pages describes how to use Kosmos Filesystem +( KFS ) as a backing +store with Hadoop. This page assumes that you have downloaded the +KFS software and installed necessary binaries as outlined in the KFS +documentation. + +

Steps

+ +
    +
  • In the Hadoop conf directory edit hadoop-default.xml, + add the following: +
    +<property>
    +  <name>fs.kfs.impl</name>
    +  <value>org.apache.hadoop.fs.kfs.KosmosFileSystem</value>
    +  <description>The FileSystem for kfs: uris.</description>
    +</property>
    +            
    + +
  • In the Hadoop conf directory edit hadoop-site.xml, + adding the following (with appropriate values for + <server> and <port>): +
    +<property>
    +  <name>fs.default.name</name>
    +  <value>kfs://<server:port></value> 
    +</property>
    +
    +<property>
    +  <name>fs.kfs.metaServerHost</name>
    +  <value><server></value>
    +  <description>The location of the KFS meta server.</description>
    +</property>
    +
    +<property>
    +  <name>fs.kfs.metaServerPort</name>
    +  <value><port></value>
    +  <description>The location of the meta server's port.</description>
    +</property>
    +
    +
    +
  • + +
  • Copy KFS's kfs-0.1.jar to Hadoop's lib directory. This step + enables Hadoop to load the KFS-specific modules. Note + that kfs-0.1.jar was built when you compiled the KFS source + code. This jar file contains code that calls KFS's client + library code via JNI; the native code is in KFS's + libkfsClient.so library. +
  • + +
  • When the Hadoop map/reduce trackers start up, those +processes (on local as well as remote nodes) will now need to load +KFS's libkfsClient.so library. To simplify this process, it is advisable to +store libkfsClient.so in an NFS accessible directory (similar to where +Hadoop binaries/scripts are stored); then, modify Hadoop's +conf/hadoop-env.sh adding the following line and providing suitable +value for <path>: +
    +export LD_LIBRARY_PATH=<path>
    +
    + + +
  • Start only the map/reduce trackers +
    + example: execute Hadoop's bin/start-mapred.sh
  • +
+
+ +If the map/reduce job trackers start up, all file-I/O is done to KFS.]]> +
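For reference, the same wiring can be sketched programmatically; this is only an illustration built from the property names listed above, with meta.example.com and 20000 as placeholder server values:

  Configuration conf = new Configuration();
  conf.set("fs.kfs.impl", "org.apache.hadoop.fs.kfs.KosmosFileSystem");
  conf.set("fs.default.name", "kfs://meta.example.com:20000");   // placeholder host:port
  conf.set("fs.kfs.metaServerHost", "meta.example.com");
  conf.set("fs.kfs.metaServerPort", "20000");
  FileSystem kfs = FileSystem.get(conf);   // resolves to the KosmosFileSystem client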
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is a tool for migrating data from an older to a newer version + of an S3 filesystem. +

+

+ All files in the filesystem are migrated by re-writing the block metadata + - no datafiles are touched. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} backed by Amazon S3. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + A distributed implementation of {@link +org.apache.hadoop.fs.FileSystem} that uses Amazon S3.

+ +

+Files are stored in S3 as blocks (represented by +{@link org.apache.hadoop.fs.s3.Block}), which have an ID and a length. +Block metadata is stored in S3 as a small record (represented by +{@link org.apache.hadoop.fs.s3.INode}) using the URL-encoded +path string as a key. Inodes record the file type (regular file or directory) and the list of blocks. +This design makes it easy to seek to any given position in a file by reading the inode data to compute +which block to access, then using S3's support for +HTTP Range headers +to start streaming from the correct position. +Renames are also efficient since only the inode is moved (by a DELETE followed by a PUT since +S3 does not support renames). +

+

+For a single file /dir1/file1 which takes two blocks of storage, the file structure in S3 +would be something like this: +

+
+/
+/dir1
+/dir1/file1
+block-6415776850131549260
+block-3026438247347758425
+
+

+Inodes start with a leading /, while blocks are prefixed with block-. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nth value.]]> + + + + + + + + + + + + + + + + + + + + + nth value in the file.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + public class IntArrayWritable extends ArrayWritable { + public IntArrayWritable() { + super(IntWritable.class); + } + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataInputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using DataInput methods ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataOutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using DataOutput methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + the class of the item + @param conf the configuration to store + @param item the object to be stored + @param keyName the name of the key to use + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param items the objects to be stored + @param keyName the name of the key to use + @throws IndexOutOfBoundsException if the items array is empty + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + DefaultStringifier offers convenience methods to store/load objects to/from + the configuration. + + @param the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a FloatWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When two sequence files, which have same Key type but different Value + types, are mapped out to reduce, multiple Value types is not allowed. + In this case, this class can help you wrap instances with different types. +

+ +

+ Compared with ObjectWritable, this class is much more effective, + because ObjectWritable will append the class declaration as a String + into the output file in every Key-Value pair. +

+ +

+ Generic Writable implements {@link Configurable} interface, so that it will be + configured by the framework. The configuration is passed to the wrapped objects + implementing {@link Configurable} interface before deserialization. +

+ + how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines + the classes which will be wrapped in GenericObject in application. + Attention: this classes defined in getTypes() method, must + implement Writable interface. +

+ + The code looks like this: +
+ public class GenericObject extends GenericWritable {
+ 
+   private static Class[] CLASSES = {
+               ClassType1.class, 
+               ClassType2.class,
+               ClassType3.class,
+               };
+
+   protected Class[] getTypes() {
+       return CLASSES;
+   }
+
+ }
+ 
+ + @since Nov 8, 2006]]> +
+
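A short usage sketch building on the GenericObject example above; ClassType1 is the placeholder Writable type from that example, and the set()/get() calls are the wrap/unwrap methods GenericWritable provides:

  GenericObject wrapper = new GenericObject();
  wrapper.set(new ClassType1());        // wrap one of the declared types
  // serialize / deserialize the wrapper as a normal Writable value ...
  Writable unwrapped = wrapper.get();   // recover the wrapped instance
  if (unwrapped instanceof ClassType1) {
    // handle ClassType1 values here
  }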
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new InputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using InputStream methods ...
+ }
+ 
+ @see DataInputBuffer + @see DataOutput]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a IntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + closes the input and output streams + at the end. + @param in InputStrem to read from + @param out OutputStream to write to + @param conf the Configuration object]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a LongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A map is a directory containing two files, the data file, + containing all keys and values in the map, and a smaller index + file, containing a fraction of the keys. The fraction is determined by + {@link Writer#getIndexInterval()}. + +

The index file is read entirely into memory. Thus key implementations + should try to keep themselves small. + +

Map files are created by adding entries in-order. To maintain a large + database, perform updates by copying the previous version of a database and + merging in a sorted change list, to create a new version of the database in + a new file. Sorting large change lists can be done with {@link + SequenceFile.Sorter}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key and + val. Returns true if such a pair exists and false when at + the end of the map]]> + + + + + + + + + + + + + + + + key or if it does not exist, at the first entry + after the named key. + +- * @param key - key that we're trying to find +- * @param val - data value if key is found +- * @return - the key that was the closest match or null if eof.]]> + + + + + + + + + key does not exist, return + the first entry that falls just before the key. Otherwise, + return the record that sorts just after. + @return - the key that was the closest match or null if eof.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is an MD5Hash whose digest contains the + same values.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new OutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ OutputBuffer buffer = new OutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using OutputStream methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
+ @see DataOutputBuffer + @see InputBuffer]]> +
+
+ + + + + + + + + + + + + + + A {@link Comparator} that operates directly on byte representations of + objects. +

+ @param + @see DeserializerComparator]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SequenceFiles are flat files consisting of binary key/value + pairs. + +

SequenceFile provides {@link Writer}, {@link Reader} and + {@link Sorter} classes for writing, reading and sorting respectively.

+ + There are three SequenceFile Writers based on the + {@link CompressionType} used to compress key/value pairs: +
    +
  1. + Writer : Uncompressed records. +
  2. +
  3. + RecordCompressWriter : Record-compressed files, only compress + values. +
  4. +
  5. + BlockCompressWriter : Block-compressed files, both keys & + values are collected in 'blocks' + separately and compressed. The size of + the 'block' is configurable. +
+ +

The actual compression algorithm used to compress key and/or values can be + specified by using the appropriate {@link CompressionCodec}.

+ +

The recommended way is to use the static createWriter methods + provided by the SequenceFile to choose the preferred format.

+ +

The {@link Reader} acts as the bridge and can read any of the above + SequenceFile formats.

+ +

SequenceFile Formats

+ +

Essentially there are 3 different formats for SequenceFiles + depending on the CompressionType specified. All of them share a + common header described below. + +

+
    +
  • + version - 3 bytes of magic header SEQ, followed by 1 byte of actual + version number (e.g. SEQ4 or SEQ6) +
  • +
  • + keyClassName -key class +
  • +
  • + valueClassName - value class +
  • +
  • + compression - A boolean which specifies if compression is turned on for + keys/values in this file. +
  • +
  • + blockCompression - A boolean which specifies if block-compression is + turned on for keys/values in this file. +
  • +
  • + compression codec - CompressionCodec class which is used for + compression of keys and/or values (if compression is + enabled). +
  • +
  • + metadata - {@link Metadata} for this file. +
  • +
  • + sync - A sync marker to denote end of the header. +
  • +
+ +
Uncompressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record +
      +
    • Record length
    • +
    • Key length
    • +
    • Key
    • +
    • Value
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +
Record-Compressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record +
      +
    • Record length
    • +
    • Key length
    • +
    • Key
    • +
    • Compressed Value
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +
Block-Compressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record Block +
      +
    • Compressed key-lengths block-size
    • +
    • Compressed key-lengths block
    • +
    • Compressed keys block-size
    • +
    • Compressed keys block
    • +
    • Compressed value-lengths block-size
    • +
    • Compressed value-lengths block
    • +
    • Compressed values block-size
    • +
    • Compressed values block
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +

The compressed blocks of key lengths and value lengths consist of the + actual lengths of individual keys/values encoded in ZeroCompressedInteger + format.
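+ 
+ As a rough usage sketch (illustrative, not part of the original description; the
+ fs, conf and path variables are assumed to already exist), writing
+ and then reading a SequenceFile looks like:
+ 
+     SequenceFile.Writer writer =
+       SequenceFile.createWriter(fs, conf, path, IntWritable.class, Text.class);
+     writer.append(new IntWritable(1), new Text("first record"));
+     writer.close();
+     
+     SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+     IntWritable key = new IntWritable();
+     Text value = new Text();
+     while (reader.next(key, value)) {
+       // process the current key/value pair here
+     }
+     reader.close();
+ 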

+ + @see CompressionCodec]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key, skipping its + value. True if another entry exists, and false at end of file.]]> + + + + + + + + key and + val. Returns true if such a pair exists and false when at + end of file]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The position passed must be a position returned by {@link + SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary + position, use {@link SequenceFile.Reader#sync(long)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SegmentDescriptor + @param segments the list of SegmentDescriptors + @param tmpDir the directory to write temporary files into + @return RawKeyValueIterator + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For best performance, applications should make sure that the {@link + Writable#readFields(DataInput)} implementation of their keys is + very efficient. In particular, it should avoid allocating memory.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This always returns a synchronized position. In other words, + immediately after calling {@link SequenceFile.Reader#seek(long)} with a position + returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However + the key may be earlier in the file than key last written when this + method was called (e.g., with block-compression, it may be the first key + in the block that was being written when this method was called).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key. Returns + true if such a key exists and false when at the end of the set.]]> + + + + + + + key. + Returns key, or null if no match exists.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + position. Note that this + method avoids using the converter or doing String instatiation + @return the Unicode scalar value at position or -1 + if the position is invalid or points to a + trailing byte]]> + + + + + + + + + + what in the backing + buffer, starting as position start. The starting + position is measured in bytes and the return value is in + terms of byte position in the buffer. The backing buffer is + not converted to a string for this operation. 
+ @return byte position of the first occurence of the search + string in the UTF-8 buffer or -1 if not found]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a Text with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException.]]> + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException. + @return ByteBuffer: bytes stores at ByteBuffer.array() + and length is ByteBuffer.limit()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In + addition, it provides methods for string traversal without converting the + byte array to a string.

Also includes utilities for + serializing/deserialing a string, coding/decoding a string, checking if a + byte array contains valid UTF8 code, calculating the length of an encoded + string.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a UTF8 with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + Also includes utilities for efficiently reading and writing UTF-8. + + @deprecated replaced by Text]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This is useful when a class may evolve, so that instances written by the + old version of the class may still be processed by the new version. To + handle this situation, {@link #readFields(DataInput)} + implementations should catch {@link VersionMismatchException}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VIntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VLongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + out. + + @param out DataOuput to serialize this object into. + @throws IOException]]> + + + + + + + in. + +

For efficiency, implementations should attempt to re-use storage in the + existing object where possible.

+ + @param in DataInput to deserialize this object from. + @throws IOException]]> +
+ + + Any key or value type in the Hadoop Map-Reduce + framework implements this interface.

+ +

Implementations typically implement a static read(DataInput) + method which constructs a new instance, calls {@link #readFields(DataInput)} + and returns the instance.

+ +

Example:

+

+     public class MyWritable implements Writable {
+       // Some data     
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public static MyWritable read(DataInput in) throws IOException {
+         MyWritable w = new MyWritable();
+         w.readFields(in);
+         return w;
+       }
+     }
+ 

]]> +
+ + + + + + + + WritableComparables can be compared to each other, typically + via Comparators. Any type which is to be used as a + key in the Hadoop Map-Reduce framework should implement this + interface.

+ +

Example:

+

+     public class MyWritableComparable implements WritableComparable {
+       // Some data
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public int compareTo(MyWritableComparable o) {
+         // Order instances by their counter field.
+         int thisValue = this.counter;
+         int thatValue = o.counter;
+         return (thisValue < thatValue ? -1 : (thisValue == thatValue ? 0 : 1));
+       }
+     }
+ 

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The default implementation reads the data into two {@link + WritableComparable}s (using {@link + Writable#readFields(DataInput)}, then calls {@link + #compare(WritableComparable,WritableComparable)}.]]> + + + + + + + The default implementation uses the natural ordering, calling {@link + Comparable#compareTo(Object)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This base implemenation uses the natural ordering. To define alternate + orderings, override {@link #compare(WritableComparable,WritableComparable)}. + +

One may optimize compare-intensive operations by overriding + {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are + provided to assist in optimized implementations of this method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum type + @param in DataInput to read from + @param enumType Class type of Enum + @return Enum represented by String read from DataInput + @throws IOException]]> + + + + + + + + + + + + + + + + len number of bytes in input streamin + @param in input stream + @param len number of bytes to skip + @throws IOException when skipped less number of bytes]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Implementations are assumed to be buffered. This permits clients to + reposition the underlying input stream then call {@link #resetState()}, + without having to also synchronize client buffers.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + true if a preset dictionary is needed for decompression. + @return true if a preset dictionary is needed for decompression]]> + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-lzo library is loaded & initialized; + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair. 
+ http://www.oberhumer.com/opensource/lzo/]]> + + + + + + + + + + + + + + + + + + + + + true if lzo compressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if lzo decompressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-zlib is loaded & initialized + and can be loaded for this job, else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keep trying a limited number of times, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+ + + + + + + + Keep trying for a maximum time, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by the number of tries so far. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by a random + number in the range of [0, 2 to the number of retries) +

]]> +
+
+ + + + + + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + + + A retry policy for RemoteException + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + Try once, and fail by re-throwing the exception. + This corresponds to having no retry mechanism in place. +

]]> +
+
+ + + + Try once, and fail silently for void methods, or by + re-throwing the exception for non-void methods. +

]]> +
+
+ + + + Keep trying forever. +

]]> +
+
+ + + A collection of useful implementations of {@link RetryPolicy}. +

]]> +
+
+ + + + + + + + + + Determines whether the framework should retry a + method for the given exception, and the number + of retries that have been made for that operation + so far. +

+ @param e The exception that caused the method to fail. + @param retries The number of times the method has been retried. + @return true if the method should be retried, + false if the method should not be retried + but shouldn't fail with an exception (only for void methods). + @throws Exception The re-thrown exception e indicating + that the method failed and should not be retried further.]]> +
+
+ + + Specifies a policy for retrying method failures. + Implementations of this interface should be immutable. +

]]> +
+
+ + + + + + + + + + + + Create a proxy for an interface of an implementation class + using the same retry policy for each method in the interface. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param retryPolicy the policy for retrying method call failures + @return the retry proxy]]> +
+
+ + + + + + + Create a proxy for an interface of an implementation class + using a set of retry policies specified by method name. + If no retry policy is defined for a method then a default of + {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param methodNameToPolicyMap a map of method names to retry policies + @return the retry proxy]]> +
+
+ + + A factory for creating retry proxies. +

]]> +
+
+ + + +A mechanism for selectively retrying methods that throw exceptions under certain circumstances. +

+ +

+Typical usage is +

+ +
+UnreliableImplementation unreliableImpl = new UnreliableImplementation();
+UnreliableInterface unreliable = (UnreliableInterface)
+  RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+    RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 10, TimeUnit.SECONDS));
+unreliable.call();
+
+ +

+This will retry any method called on unreliable four times - in this case the call() +method - sleeping 10 seconds between +each retry. There are a number of {@link org.apache.hadoop.io.retry.RetryPolicies retry policies} +available, or you can implement a custom one by implementing {@link org.apache.hadoop.io.retry.RetryPolicy}. +It is also possible to specify retry policies on a +{@link org.apache.hadoop.io.retry.RetryProxy#create(Class, Object, Map) per-method basis}. +
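+
+As a rough sketch (this class is illustrative and not part of Hadoop), a custom policy
+only needs to implement the shouldRetry contract described above; the example below
+retries IOExceptions a bounded number of times and re-throws everything else:
+
+public class RetryOnIOExceptionPolicy implements RetryPolicy {
+  private final int maxRetries;
+  public RetryOnIOExceptionPolicy(int maxRetries) { this.maxRetries = maxRetries; }
+  public boolean shouldRetry(Exception e, int retries) throws Exception {
+    if (e instanceof IOException && retries < maxRetries) {
+      return true;   // ask the proxy to invoke the failed method again
+    }
+    throw e;         // give up: re-throw so the caller sees the failure
+  }
+}
+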

]]> +
+
+ + + + + + + + Prepare the deserializer for reading.

]]> +
+
+ + + + + + Deserialize the next object from the underlying input stream. + If the object t is non-null then this deserializer + may set its internal state to the next object read from the input + stream. Otherwise, if the object t is null a new + deserialized object will be created. +

+ @return the deserialized object]]> +
+
+ + + + Close the underlying input stream and clear up any resources.

]]> +
+
+ + + Provides a facility for deserializing objects of type from an + {@link InputStream}. +

+ +

+ Deserializers are stateful, but must not buffer the input since + other producers may read from the input between calls to + {@link #deserialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + A {@link RawComparator} that uses a {@link Deserializer} to deserialize + the objects to be compared so that the standard {@link Comparator} can + be used to compare them. +

+

+ One may optimize compare-intensive operations by using a custom + implementation of {@link RawComparator} that operates directly + on byte representations. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + An experimental {@link Serialization} for Java {@link Serializable} classes. +

+ @see JavaSerializationComparator]]> +
+
+ + + + + + + + + + + + + A {@link RawComparator} that uses a {@link JavaSerialization} + {@link Deserializer} to deserialize objects that are then compared via + their {@link Comparable} interfaces. +

+ @param + @see JavaSerialization]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + Encapsulates a {@link Serializer}/{@link Deserializer} pair. +

+ @param ]]> +
+
+ + + + + + + Serializations are found by reading the io.serializations + property from conf, which is a comma-delimited list of + classnames. +

]]> +
+
+ + + + + + + + + + + + A factory for {@link Serialization}s. +

]]> +
+
+ + + + + + + + Prepare the serializer for writing.

]]> +
+
+ + + + + Serialize t to the underlying output stream.

]]> +
+
+ + + + Close the underlying output stream and clear up any resources.

]]> +
+
+ + + Provides a facility for serializing objects of type to an + {@link OutputStream}. +

+ +

+ Serializers are stateful, but must not buffer the output since + other producers may write to the output between calls to + {@link #serialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + +This package provides a mechanism for using different serialization frameworks +in Hadoop. The property "io.serializations" defines a list of +{@link org.apache.hadoop.io.serializer.Serialization}s that know how to create +{@link org.apache.hadoop.io.serializer.Serializer}s and +{@link org.apache.hadoop.io.serializer.Deserializer}s. +

+ +

+To add a new serialization framework write an implementation of +{@link org.apache.hadoop.io.serializer.Serialization} and add its name to the +"io.serializations" property. +
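+
+For example (an illustrative sketch; the first class name is the Writable-based
+default serialization and the second is the JavaSerialization described above):
+
+Configuration conf = new Configuration();
+conf.set("io.serializations",
+         "org.apache.hadoop.io.serializer.WritableSerialization,"
+         + "org.apache.hadoop.io.serializer.JavaSerialization");
+SerializationFactory factory = new SerializationFactory(conf);
+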

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param, to the IPC server running at + address, returning the value. Throws exceptions if there are + network problems or if the remote code threw an exception.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unwraps any IOException. + + @param lookupTypes the desired exception class. + @return IOException, which is either the lookupClass exception or this.]]> + + + + + This unwraps any Throwable that has a constructor taking + a String as a parameter. + Otherwise it returns this. + + @return Throwable]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + protocol is a Java interface. All parameters and return types must + be one of: + +
  • a primitive type, boolean, byte, + char, short, int, long, + float, double, or void; or
  • + +
  • a {@link String}; or
  • + +
  • a {@link Writable}; or
  • + +
  • an array of the above types
+ + All methods in the protocol should throw only IOException. No field data of + the protocol instance is transmitted.]]> +
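+
+ For illustration only (the interface name and its methods are invented, not part of
+ Hadoop), a protocol that follows these rules might look like:
+
+     public interface PingProtocol {
+       // Only primitives, Strings, Writables and arrays of these may appear,
+       // and every method declares IOException.
+       long getTimestamp() throws IOException;
+       Text echo(Text message) throws IOException;
+       int[] addAll(int[] values) throws IOException;
+     }
+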
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + handlerCount determines + the number of handler threads that will be used to process calls.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #rpcDiscardedOps}.inc(time)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker, + as {@link JobTracker.State} + + @return the current state of the JobTracker.]]> + + + + + + + + + + + + ClusterStatus provides clients with information such as: +

    +
  1. + Size of the cluster. +
  2. +
  3. + Task capacity of the cluster. +
  4. +
  5. + The number of currently running map & reduce tasks. +
  6. +
  7. + State of the JobTracker. +
  8. +

+ +

Clients can query for the latest ClusterStatus, via + {@link JobClient#getClusterStatus()}.

+ + @see JobClient]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If the retain time is zero jobs are not persisted. +

+ A daemon thread cleans up job info files older than the retain time +

+ The retain time can be set with the 'persist.jobstatus.hours' + configuration variable (it is in hours).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter can be of + any {@link Enum} type.

+ +

Counters are bunched into {@link Group}s, each comprising of + counters from a particular Enum class.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of counters, comprising of counters from a particular + counter {@link Enum} class. + +

Group handles localization of the class name and the + counter names.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param fs the file system that the file is on + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides generic implementations of + {@link #validateInput(JobConf)} and {@link #getSplits(JobConf, int)}. + Implementations fo FileInputFormat can also override the + {@link #isSplitable(FileSystem, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s.]]> + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the taskid, say + task_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any + side-files required in ${mapred.work.output.dir} during execution + of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the + framework will move them out similarly - thus she doesn't have to pick + unique paths per task-attempt.

+ +

Note: the value of ${mapred.work.output.dir} during + execution of a particular task-attempt is actually + ${mapred.output.dir}/_temporary/_{$taskid}, and this value is + set by the map-reduce framework. So, just create any side-files in the + path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce + task to take advantage of this feature.

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
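+
+ A brief sketch (illustrative only; job is the task's JobConf and the
+ file name is made up) of creating a side-file via the method described here:
+
+     Path workDir = FileOutputFormat.getWorkOutputPath(job);
+     FileSystem fs = workDir.getFileSystem(job);
+     // Files written under workDir are promoted to ${mapred.output.dir}
+     // only if the task-attempt succeeds.
+     FSDataOutputStream side = fs.create(new Path(workDir, "my-side-file"));
+     side.writeBytes("auxiliary output\n");
+     side.close();
+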
+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This method is used to validate the input directories when a job is + submitted so that the {@link JobClient} can fail early, with an useful + error message, in case of errors. For e.g. input directory does not exist. +

+ + @param job job configuration. + @throws InvalidInputException if the job does not have valid input]]> +
+
+ + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For e.g. a split could + be <input-file-path, start, offset> tuple. + + @param job job configuration. + @param numSplits the desired number of splits, a hint. + @return an array of {@link InputSplit}s for the job.]]> + + + + + + + + + It is the responsibility of the RecordReader to respect + record boundaries while processing the logical split to present a + record-oriented view to the individual task.

+ + @param split the {@link InputSplit} + @param job the job that this split belongs to + @return a {@link RecordReader}]]> +
+
+ + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
  1. + Validate the input-specification of the job. +
  2. + Split-up the input file(s) into logical {@link InputSplit}s, each of + which is then assigned to an individual {@link Mapper}. +
  3. +
  4. + Provide the {@link RecordReader} implementation to be used to glean + input records from the logical InputSplit for processing by + the {@link Mapper}. +
  5. +
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Clearly, logical splits based on input-size is insufficient for many + applications since record boundaries are to respected. In such cases, the + application has to also implement a {@link RecordReader} on whom lies the + responsibilty to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see JobClient + @see FileInputFormat]]> + + + + + + + + + + InputSplit. + + @return the number of bytes in the input split. + @throws IOException]]> + + + + + + InputSplit is + located as an array of Strings. + @throws IOException]]> + + + + InputSplit represents the data to be processed by an + individual {@link Mapper}. + +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + jobid doesn't correspond to any known job. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient is the primary interface for the user-job to interact + with the {@link JobTracker}. + + JobClient provides facilities to submit jobs, track their + progress, access component-tasks' reports/logs, get the Map-Reduce cluster + status information etc. + +

The job submission process involves: +

    +
  1. + Checking the input and output specifications of the job. +
  2. +
  3. + Computing the {@link InputSplit}s for the job. +
  4. +
  5. + Setup the requisite accounting information for the {@link DistributedCache} + of the job, if necessary. +
  6. +
  7. + Copying the job's jar and configuration to the map-reduce system directory + on the distributed file-system. +
  8. +
  9. + Submitting the job to the JobTracker and optionally monitoring + its status. +
  10. +

+ + Normally the user creates the application, describes various facets of the + job via {@link JobConf} and then uses the JobClient to submit + the job and monitor its progress. + +

Here is an example on how to use JobClient:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     job.setInputPath(new Path("in"));
+     job.setOutputPath(new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+
+     // Submit the job, then poll for progress until the job is complete
+     JobClient.runJob(job);
+ 

+ +

Job Control

+ +

At times clients would chain map-reduce jobs to accomplish complex tasks + which cannot be done via a single map-reduce job. This is fairly easy since + the output of the job, typically, goes to distributed file-system and that + can be used as the input for the next job.

+ +

However, this also means that the onus on ensuring jobs are complete + (success/failure) lies squarely on the clients. In such situations the + various job-control options are: +

    +
  1. + {@link #runJob(JobConf)} : submits the job and returns only after + the job has completed. +
  2. +
  3. + {@link #submitJob(JobConf)} : only submits the job, then poll the + returned handle to the {@link RunningJob} to query status and make + scheduling decisions. +
  4. +
  5. + {@link JobConf#setJobEndNotificationURI(String)} : setup a notification + on job-completion, thus avoiding polling. +
  6. +

+ + @see JobConf + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
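+
+ A minimal sketch of the submit-then-poll option above (illustrative only;
+ jobClient is an existing JobClient instance and error handling is omitted):
+
+     RunningJob running = jobClient.submitJob(job);
+     while (!running.isComplete()) {
+       Thread.sleep(5000);        // poll every few seconds
+     }
+     if (!running.isSuccessful()) {
+       // react to the failure, e.g. resubmit or give up
+     }
+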
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if framework should keep the intermediate files + for failed tasks, false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Note: +

+ @param dir the {@link Path} of the output directory for the map-reduce job.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the outputs of the maps are to be compressed, + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This comparator should be provided if the equivalence rules for keys + for sorting the intermediates are different from those for grouping keys + before each call to + {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

+ +

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + in a single call to the reduce function if K1 and K2 compare as equal.

+ +

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + how keys are sorted, this can be used in conjunction to simulate + secondary sort on values.

+ +

Note: This is not a guarantee of the reduce sort being + stable in any sense. (In any case, with the order of available + map-outputs to the reduce being non-deterministic, it wouldn't make + that much sense.)

+ + @param theClass the comparator class to be used for grouping keys. + It should implement RawComparator. + @see #setOutputKeyComparatorClass(Class)]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. Typically the combiner is same as the + the {@link Reducer} for the job i.e. {@link #getReducerClass()}. + + @return the user-defined combiner class used to combine map-outputs.]]> + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. + +

The combiner is a task-level aggregation operation which, in some cases, + helps to cut down the amount of data transferred from the {@link Mapper} to + the {@link Reducer}, leading to better performance.

+ +

Typically the combiner is same as the Reducer for the + job i.e. {@link #setReducerClass(Class)}.

+ + @param theClass the user-defined combiner class used to combine + map-outputs.]]> +
+
+ + + true. + + @return true if speculative execution be used for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on, else false.]]> + + + + + true. + + @return true if speculative execution be + used for this job for map tasks, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for map tasks, + else false.]]> + + + + + true. + + @return true if speculative execution be used + for reduce tasks for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for reduce tasks, + else false.]]> + + + + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + Note: This is only a hint to the framework. The actual + number of spawned map tasks depends on the number of {@link InputSplit}s + generated by the job's {@link InputFormat#getSplits(JobConf, int)}. + + A custom {@link InputFormat} is typically used to accurately control + the number of map tasks for the job.

+ +

How many maps?

+ +

The number of maps is usually driven by the total size of the inputs + i.e. total number of blocks of the input files.

+ +

The right level of parallelism for maps seems to be around 10-100 maps + per-node, although it has been set up to 300 or so for very cpu-light map + tasks. Task setup takes awhile, so it is best if the maps take at least a + minute to execute.

+ +

The default behavior of file-based {@link InputFormat}s is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of input files. However, the {@link FileSystem} blocksize of the + input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Thus, if you expect 10TB of input data and have a blocksize of 128MB, + you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is + used to set it even higher.

+ + @param n the number of map tasks for this job. + @see InputFormat#getSplits(JobConf, int) + @see FileInputFormat + @see FileSystem#getDefaultBlockSize() + @see FileStatus#getBlockSize()]]> +
+
+ + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + How many reduces? + +

The right number of reduces seems to be 0.95 or + 1.75 multiplied by (<no. of nodes> * + + mapred.tasktracker.reduce.tasks.maximum). +

+ +

With 0.95 all of the reduces can launch immediately and + start transferring map outputs as the maps finish. With 1.75 + the faster nodes will finish their first round of reduces and launch a + second wave of reduces doing a much better job of load balancing.
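+ 
+ For instance (an illustrative calculation, not from the original text), a cluster of
+ 100 nodes with mapred.tasktracker.reduce.tasks.maximum set to 2 gives
+ 0.95 * 100 * 2 = 190 reduces with the first factor, or 1.75 * 100 * 2 = 350 with
+ the second.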

+ +

Increasing the number of reduces increases the framework overhead, but + increases load balancing and lowers the cost of failures.

+ +

The scaling factors above are slightly less than whole numbers to + reserve a few reduce slots in the framework for speculative-tasks, failures + etc.

+ +

Reducer NONE

+ +

It is legal to set the number of reduce-tasks to zero.

+ +

In this case the output of the map-tasks directly go to distributed + file-system, to the path set by + {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the + framework doesn't sort the map-outputs before writing it out to HDFS.

+ + @param n the number of reduce tasks for this job.]]> +
+
+ + + mapred.map.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per map task.]]> + + + + + + + + + + + mapred.reduce.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per reduce task.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + noFailures, the + tasktracker is blacklisted for this job. + + @param noFailures maximum no. of failures of a given job per tasktracker.]]> + + + + + blacklisted for this job. + + @return the maximum no. of failures of a given job per tasktracker.]]> + + + + + failed. + + Defaults to zero, i.e. any failed map-task results in + the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + failed. + + Defaults to zero, i.e. any failed reduce-task results + in the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The debug script can aid debugging of failed map tasks. The script is + given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setMapDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param mDbgScript the script name]]> +
+
+ + + + + + + + + The debug script can aid debugging of failed reduce tasks. The script + is given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script file needs to be symlinked

+ +

Here is an example on how to submit a script +

+ job.setReduceDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param rDbgScript the script name]]> +
+
+ + + + + + + + null if it hasn't + been set. + @see #setJobEndNotificationURI(String)]]> + + + + + + The uri can contain 2 special parameters: $jobId and + $jobStatus. Those, if present, are replaced by the job's + identifier and completion-status respectively.

+ +

This is typically used by application-writers to implement chaining of + Map-Reduce jobs in an asynchronous manner.

+ + @param uri the job end notification uri + @see JobStatus + @see Job Completion and Chaining]]> +
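+
+ For example (the host, port and query string are illustrative):
+
+ job.setJobEndNotificationURI(
+     "http://myserver.example.com:8080/notify?jobid=$jobId&status=$jobStatus");
+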
+
+ + + + When a job starts, a shared directory is created at location + + ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ . + This directory is exposed to the users through + job.local.dir . + So, the tasks can use this space + as scratch space and share files among them.

+ This value is available as System property also. + + @return The localized job specific shared directory]]> +
+
+ + JobConf is the primary interface for a user to describe a + map-reduce job to the Hadoop framework for execution. The framework tries to + faithfully execute the job as-is described by JobConf, however: +
    +
  1. + Some configuration parameters might have been marked as + + final by administrators and hence cannot be altered. +
  2. +
  3. + While some job parameters are straight-forward to set + (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with the + rest of the framework and/or job-configuration and are relatively more + complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}). +
  4. +

+ +

JobConf typically specifies the {@link Mapper}, combiner + (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and + {@link OutputFormat} implementations to be used etc. + +

Optionally JobConf is used to specify other advanced facets + of the job such as Comparators to be used, files to be put in + the {@link DistributedCache}, whether or not intermediate and/or job outputs + are to be compressed (and how), debugability via user-provided scripts + ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}), + for doing post-processing on task logs, task's stdout, stderr, syslog. + and etc.

+ +

Here is an example on how to configure a job via JobConf:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     FileInputFormat.setInputPaths(job, new Path("in"));
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+     
+     job.setInputFormat(SequenceFileInputFormat.class);
+     job.setOutputFormat(SequenceFileOutputFormat.class);
+ 

+ + @see JobClient + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -archives + -files inputjar args]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + system-dir/jobName.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + zero. + + @param conf configuration for the JobTracker. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + io.file.buffer.size specified in the given + Configuration. + @param in input stream + @param conf configuration + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Output pairs need not be of the same types as input pairs. A given + input pair may map to zero or many output pairs. Output pairs are + collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the input key. + @param value the input value. + @param output collects mapped keys and values. + @param reporter facility to report progress.]]> +
+
+ + Maps are the individual tasks which transform input records into + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link JobConf} for the + job via the {@link JobConfigurable#configure(JobConf)} and initialize + themselves. Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

The framework then calls + {@link #map(Object, Object, OutputCollector, Reporter)} + for each key/value pair in the InputSplit for that task.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the grouping by specifying + a Comparator via + {@link JobConf#setOutputKeyComparatorClass(Class)}.

+ +

The grouped Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

The intermediate, grouped outputs are always stored in + {@link SequenceFile}s. Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the JobConf.

+ +

If the job has + zero + reduces then the output of the Mapper is directly written + to the {@link FileSystem} without grouping by keys.

+ +

Example:

+

+     public class MyMapper<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Mapper<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+       
+       private String mapTaskId;
+       private String inputFile;
+       private int noRecords = 0;
+       
+       public void configure(JobConf job) {
+         mapTaskId = job.get("mapred.task.id");
+         inputFile = job.get("mapred.input.file");
+       }
+       
+       public void map(K key, V val,
+                       OutputCollector<K, V> output, Reporter reporter)
+       throws IOException {
+         // Process the <key, value> pair (assume this takes a while)
+         // ...
+         // ...
+         
+         // Let the framework know that we are alive, and kicking!
+         // reporter.progress();
+         
+         // Process some more
+         // ...
+         // ...
+         
+         // Increment the no. of <key, value> pairs processed
+         ++noRecords;
+
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+        
+         // Every 100 records update application-level status
+         if ((noRecords%100) == 0) {
+           reporter.setStatus(mapTaskId + " processed " + noRecords + 
+                              " from input-file: " + inputFile); 
+         }
+         
+         // Output the result
+         output.collect(key, val);
+       }
+     }
+ 

+ +

Applications may write a custom {@link MapRunnable} to exert greater + control on map processing e.g. multi-threaded Mappers etc.

+ + @see JobConf + @see InputFormat + @see Partitioner + @see Reducer + @see MapReduceBase + @see MapRunnable + @see SequenceFile]]> +
+
+ + + + + + + + + + + + + + + + + + + + + Provides default no-op implementations for a few methods, most non-trivial + applications need to override some of them.

]]> +
+
+ + + + + + + + + + + <key, value> pairs. + +

Mapping of input records to output records is complete when this method + returns.

+ + @param input the {@link RecordReader} to read the input records. + @param output the {@link OutputCollector} to collect the outputrecords. + @param reporter {@link Reporter} to report progress, status-updates etc. + @throws IOException]]> +
+
+ + Custom implementations of MapRunnable can exert greater + control on map processing e.g. multi-threaded, asynchronous mappers etc.

+ + @see Mapper]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly + equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} + to construct RecordReader's for MultiFileSplit's. + @see MultiFileSplit]]> +
+
+ + + + + + + + + + + + + + + + + th Path]]> + + + + + + + + + + + th Path]]> + + + + + + + + + + + + + + + + + + + + + + + MultiFileSplit can be used to implement {@link RecordReader}'s, with + reading one record per file. + @see FileSplit + @see MultiFileInputFormat]]> + + + + + + + + + + + + + + + <key, value> pairs output by {@link Mapper}s + and {@link Reducer}s. + +

OutputCollector is the generalization of the facility + provided by the Map-Reduce framework to collect data output by either the + Mapper or the Reducer i.e. intermediate outputs + or the output of the job.

]]> +
+
This is to validate the output specification for the job when a job is submitted. Typically it checks that the output does not already exist, throwing an exception when it does, so that output is not overwritten.

+ + @param ignored + @param job job configuration. + @throws IOException when output should not be attempted]]> +
+
+ + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    +
  1. Validate the output-specification of the job, e.g. check that the
     output directory doesn't already exist.
  2. Provide the {@link RecordWriter} implementation to be used to write out
     the output files of the job. Output files are stored in a
     {@link FileSystem}.
+ + @see RecordWriter + @see JobConf]]> +
+
+ + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be partitioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.

+ + @see Reducer]]> +
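As a sketch of the contract above, a hash-based partitioner (a hypothetical class, equivalent in spirit to default hash partitioning) could look like:

     public class MyHashPartitioner<K2, V2> implements Partitioner<K2, V2> {
       public void configure(JobConf job) {}            // no configuration needed
       public int getPartition(K2 key, V2 value, int numPartitions) {
         // Mask the sign bit so the result is always in [0, numPartitions)
         return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
       }
     }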
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0 to 1.0. + @throws IOException]]> + + + + RecordReader reads <key, value> pairs from an + {@link InputSplit}. + +

RecordReader, typically, converts the byte-oriented view of + the input, provided by the InputSplit, and presents a + record-oriented view for the {@link Mapper} & {@link Reducer} tasks for + processing. It thus assumes the responsibility of processing record + boundaries and presenting the tasks with keys and values.
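A sketch of the usual consumption loop; the reader, the mapper call and the K/V types are assumptions for illustration:

     K key = reader.createKey();
     V value = reader.createValue();
     while (reader.next(key, value)) {                  // false once the split is exhausted
       mapper.map(key, value, output, reporter);        // hand each record to the application
     }
     reader.close();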

+ + @see InputSplit + @see InputFormat]]> +
+
+ + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param reporter facility to report progress. + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + Reduces values for a given key. + +

The framework calls this method for each + <key, (list of values)> pair in the grouped inputs. + Output values must be of the same type as input values. Input keys must + not be altered. Typically all values are combined into zero or one value. +

+ +

Output pairs are collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the key. + @param values the list of values to reduce. + @param output to collect keys and combined values. + @param reporter facility to report progress.]]> +
+ + + The number of Reducers for the job is set by the user via + {@link JobConf#setNumReduceTasks(int)}. Reducer implementations + can access the {@link JobConf} for the job via the + {@link JobConfigurable#configure(JobConf)} method and initialize themselves. + Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

Reducer has 3 primary phases:

+
    +
  1. Shuffle

     Reducer is input the grouped output of a {@link Mapper}. In this phase
     the framework, for each Reducer, fetches the relevant partition of the
     output of all the Mappers, via HTTP.

  2. Sort

     The framework groups Reducer inputs by keys
     (since different Mappers may have output the same key) in this stage.

     The shuffle and sort phases occur simultaneously, i.e. while outputs are
     being fetched they are merged.

     SecondarySort

     If equivalence rules for keys while grouping the intermediates are
     different from those for grouping keys before reduction, then one may
     specify a Comparator via
     {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
     {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
     control how intermediate keys are sorted, these can be used in
     conjunction to simulate secondary sort on values.

     For example, say that you want to find duplicate web pages and tag them
     all with the url of the "best" known example. You would set up the job
     like:
       • Map Input Key: url
       • Map Input Value: document
       • Map Output Key: document checksum, url pagerank
       • Map Output Value: url
       • Partitioner: by checksum
       • OutputKeyComparator: by checksum and then decreasing pagerank
       • OutputValueGroupingComparator: by checksum
     (a JobConf sketch for this setup is shown below)

  3. Reduce

     In this phase the
     {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
     method is called for each <key, (list of values)> pair in
     the grouped inputs.

     The output of the reduce task is typically written to the
     {@link FileSystem} via
     {@link OutputCollector#collect(Object, Object)}.
+ +

The output of the Reducer is not re-sorted.
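A configuration sketch for the secondary-sort setup described in the list above; the job class and the three partitioner/comparator classes are hypothetical names standing in for application code:

     JobConf job = new JobConf(DeduplicationJob.class);
     job.setPartitionerClass(ChecksumPartitioner.class);              // partition by checksum
     job.setOutputKeyComparatorClass(ChecksumThenPageRankComparator.class);
     job.setOutputValueGroupingComparator(ChecksumGroupingComparator.class);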

+ +

Example:

+

+     public class MyReducer<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Reducer<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+        
+       private String reduceTaskId;
+       private int noKeys = 0;
+       
+       public void configure(JobConf job) {
+         reduceTaskId = job.get("mapred.task.id");
+       }
+       
+       public void reduce(K key, Iterator<V> values,
+                          OutputCollector<K, V> output, 
+                          Reporter reporter)
+       throws IOException {
+       
+         // Process
+         int noValues = 0;
+         while (values.hasNext()) {
+           V value = values.next();
+           
+           // Increment the no. of values for this key
+           ++noValues;
+           
+           // Process the <key, value> pair (assume this takes a while)
+           // ...
+           // ...
+           
+           // Let the framework know that we are alive, and kicking!
+           if ((noValues%10) == 0) {
+             reporter.progress();
+           }
+         
+           // Process some more
+           // ...
+           // ...
+           
+           // Output the <key, value> 
+           output.collect(key, value);
+         }
+         
+         // Increment the no. of <key, list of values> pairs processed
+         ++noKeys;
+         
+         // Increment counters
+         reporter.incrCounter(MyCounters.NUM_RECORDS, 1);
+         
+         // Every 100 keys update application-level status
+         if ((noKeys%100) == 0) {
+           reporter.setStatus(reduceTaskId + " processed " + noKeys);
+         }
+       }
+     }
+ 

+ + @see Mapper + @see Partitioner + @see Reporter + @see MapReduceBase]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum. + @param amount A non-negative amount by which the counter is to + be incremented.]]> + + + + + + InputSplit that the map is reading from. + @throws UnsupportedOperationException if called outside a mapper]]> + + + + + + + + + {@link Mapper} and {@link Reducer} can use the Reporter + provided to report progress or just indicate that they are alive. In + scenarios where the application takes an insignificant amount of time to + process individual key/value pairs, this is crucial since the framework + might assume that the task has timed-out and kill that task. + +

Applications can also update {@link Counters} via the provided + Reporter .

+ + @see Progressable + @see Counters]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RunningJob is the user-interface to query for details on a + running Map-Reduce job. + +

Clients can get hold of RunningJob via the {@link JobClient} + and then query the running-job for details such as name, configuration, + progress etc.
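A sketch of that interaction, assuming a fully configured JobConf named conf (error handling omitted; the calls throw IOException/InterruptedException):

     JobClient client = new JobClient(conf);
     RunningJob running = client.submitJob(conf);       // returns without waiting for completion
     while (!running.isComplete()) {
       System.out.println("map " + running.mapProgress()
           + " reduce " + running.reduceProgress());
       Thread.sleep(5000);                              // poll every few seconds
     }
     System.out.println(running.isSuccessful() ? "succeeded" : "failed");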

+ + @see JobClient]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f. The filtering criteria is + MD5(key) % f == 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + f using + the criteria record# % f == 0. + For example, if the frequency is 10, one out of 10 records is returned.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + @param name The name of the server + @param port The port to use on the server + @param findPort whether the server should start at the given port and + increment by 1 until it finds a free port.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points to the log directory + "/static/" -> points to common static files (src/webapps/static) + "/" -> the jsp server code from (src/webapps/)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.log.dir.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A software framework for easily writing applications which process vast +amounts of data (multi-terabyte data-sets) parallelly on large clusters +(thousands of nodes) built of commodity hardware in a reliable, fault-tolerant +manner.

+ +

A Map-Reduce job usually splits the input data-set into independent chunks which are processed by map tasks in a completely parallel manner, followed by reduce tasks which aggregate their output. Typically both the input and the output of the job are stored in a {@link org.apache.hadoop.fs.FileSystem}. The framework takes care of monitoring tasks and re-executing failed ones. Since, usually, the compute nodes and the storage nodes are the same, i.e. Hadoop's Map-Reduce framework and Distributed FileSystem run on the same set of nodes, tasks are effectively scheduled on the nodes where data is already present, resulting in very high aggregate bandwidth across the cluster.

+ +

The Map-Reduce framework operates exclusively on <key, value> +pairs i.e. the input to the job is viewed as a set of <key, value> +pairs and the output as another, possibly different, set of +<key, value> pairs. The keys and values have to +be serializable as {@link org.apache.hadoop.io.Writable}s and additionally the +keys have to be {@link org.apache.hadoop.io.WritableComparable}s in +order to facilitate grouping by the framework.
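As a sketch of that requirement, a custom key type (hypothetical, with the usual java.io and org.apache.hadoop.io imports assumed) would implement both interfaces:

     public class MyKey implements WritableComparable {
       private long id;
       public void write(DataOutput out) throws IOException { out.writeLong(id); }
       public void readFields(DataInput in) throws IOException { id = in.readLong(); }
       public int compareTo(Object other) {             // ordering used when the framework sorts keys
         long o = ((MyKey) other).id;
         return id < o ? -1 : (id == o ? 0 : 1);
       }
     }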

+ +

Data flow:

+
+                                (input)
+                                <k1, v1>
+       
+                                   |
+                                   V
+       
+                                  map
+       
+                                   |
+                                   V
+
+                                <k2, v2>
+       
+                                   |
+                                   V
+       
+                                combine
+       
+                                   |
+                                   V
+       
+                                <k2, v2>
+       
+                                   |
+                                   V
+       
+                                 reduce
+       
+                                   |
+                                   V
+       
+                                <k3, v3>
+                                (output)
+
+ +

Applications typically implement +{@link org.apache.hadoop.mapred.Mapper#map(Object, Object, OutputCollector, Reporter)} +and +{@link org.apache.hadoop.mapred.Reducer#reduce(Object, Iterator, OutputCollector, Reporter)} +methods. The application-writer also specifies various facets of the job such +as input and output locations, the Partitioner, InputFormat +& OutputFormat implementations to be used etc. as +a {@link org.apache.hadoop.mapred.JobConf}. The client program, +{@link org.apache.hadoop.mapred.JobClient}, then submits the job to the framework +and optionally monitors it.

+ +

The framework spawns one map task per {@link org.apache.hadoop.mapred.InputSplit} generated by the {@link org.apache.hadoop.mapred.InputFormat} of the job and calls {@link org.apache.hadoop.mapred.Mapper#map(Object, Object, OutputCollector, Reporter)} with each <key, value> pair read by the {@link org.apache.hadoop.mapred.RecordReader} from the InputSplit for the task. The intermediate outputs of the maps are then grouped by keys and optionally aggregated by the combiner. The key space of the intermediate outputs is partitioned by the {@link org.apache.hadoop.mapred.Partitioner}, where the number of partitions is exactly the number of reduce tasks for the job.

+ +

The reduce tasks fetch the sorted intermediate outputs of the maps, via HTTP, merge the <key, value> pairs and call {@link org.apache.hadoop.mapred.Reducer#reduce(Object, Iterator, OutputCollector, Reporter)} for each <key, list of values> pair. The output of the reduce tasks is stored on the FileSystem by the {@link org.apache.hadoop.mapred.RecordWriter} provided by the {@link org.apache.hadoop.mapred.OutputFormat} of the job.

+ +

Map-Reduce application to perform a distributed grep:

+

+public class Grep extends Configured implements Tool {
+
+  // map: Search for the pattern specified by 'grep.mapper.regex' &
+  //      'grep.mapper.regex.group'
+
+  class GrepMapper<K> 
+  extends MapReduceBase  implements Mapper<K, Text, Text, LongWritable> {
+
+    private Pattern pattern;
+    private int group;
+
+    public void configure(JobConf job) {
+      pattern = Pattern.compile(job.get("grep.mapper.regex"));
+      group = job.getInt("grep.mapper.regex.group", 0);
+    }
+
+    public void map(K key, Text value,
+                    OutputCollector<Text, LongWritable> output,
+                    Reporter reporter)
+    throws IOException {
+      String text = value.toString();
+      Matcher matcher = pattern.matcher(text);
+      while (matcher.find()) {
+        output.collect(new Text(matcher.group(group)), new LongWritable(1));
+      }
+    }
+  }
+
+  // reduce: Count the number of occurrences of the pattern
+
+  class GrepReducer<K> extends MapReduceBase
+  implements Reducer<K, LongWritable, K, LongWritable> {
+
+    public void reduce(K key, Iterator<LongWritable> values,
+                       OutputCollector<K, LongWritable> output,
+                       Reporter reporter)
+    throws IOException {
+
+      // sum all values for this key
+      long sum = 0;
+      while (values.hasNext()) {
+        sum += values.next().get();
+      }
+
+      // output sum
+      output.collect(key, new LongWritable(sum));
+    }
+  }
+  
+  public int run(String[] args) throws Exception {
+    if (args.length < 3) {
+      System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
+      ToolRunner.printGenericCommandUsage(System.out);
+      return -1;
+    }
+
+    JobConf grepJob = new JobConf(getConf(), Grep.class);
+    
+    grepJob.setJobName("grep");
+
+    grepJob.setInputPath(new Path(args[0]));
+    grepJob.setOutputPath(new Path(args[1]));
+
+    grepJob.setMapperClass(GrepMapper.class);
+    grepJob.setCombinerClass(GrepReducer.class);
+    grepJob.setReducerClass(GrepReducer.class);
+
+    grepJob.set("mapred.mapper.regex", args[2]);
+    if (args.length == 4)
+      grepJob.set("mapred.mapper.regex.group", args[3]);
+
+    grepJob.setOutputFormat(SequenceFileOutputFormat.class);
+    grepJob.setOutputKeyClass(Text.class);
+    grepJob.setOutputValueClass(LongWritable.class);
+
+    JobClient.runJob(grepJob);
+
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new Grep(), args);
+    System.exit(res);
+  }
+
+}
+
+ +

Notice how the data-flow of the above grep job is very similar to doing the +same via the unix pipeline:

+ +
+cat input/*   |   grep   |   sort    |   uniq -c   >   out
+
+ +
+      input   |    map   |  shuffle  |   reduce    >   out
+
+ +

Hadoop Map-Reduce applications need not be written in Java™ only. Hadoop Streaming is a utility which allows users to create and run jobs with any executables (e.g. shell utilities) as the mapper and/or the reducer. Hadoop Pipes is a SWIG-compatible C++ API to implement Map-Reduce applications (non-JNI™ based).
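For example, a streaming job that uses standard shell utilities as the mapper and reducer can be launched along these lines (the streaming jar's path and name vary by release):

     bin/hadoop jar hadoop-streaming.jar \
         -input  myInputDirs \
         -output myOutputDir \
         -mapper /bin/cat \
         -reducer /usr/bin/wc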

+ +

See Google's original +Map/Reduce paper for background information.

+ +

Java and JNI are trademarks or registered trademarks of +Sun Microsystems, Inc. in the United States and other countries.

]]> +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the Job was added.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Utilities for managing dependent jobs.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ([,]*) + func ::= tbl(,"") + class ::= @see java.lang.Class#forName(java.lang.String) + path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) + } + Reads expression from the mapred.join.expr property and + user-supplied join types from mapred.join.define.<ident> + types. Paths supplied to tbl are given as input paths to the + InputFormat class listed. + @see #compose(java.lang.String, java.lang.Class, java.lang.String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ,

) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + mapred.join.define.<ident> to a classname. In the expression + mapred.join.expr, the identifier will be assumed to be a + ComposableRecordReader. + mapred.join.keycomparator can be a classname used to compare keys + in the join. + @see JoinRecordReader + @see MultiFilterRecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + capacity children to position + id in the parent reader. + The id of a root CompositeRecordReader is -1 by convention, but relying + on this is not recommended.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + override(S1,S2,S3) will prefer values + from S3 over S2, and values from S2 over S1 for all keys + emitted from all sources.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [,,...,]]]> + + + + + + + out. + TupleWritable format: + {@code + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Given a set of sorted datasets keyed with the same class and yielding equal +partitions, it is possible to effect a join of those datasets prior to the map. +This could save costs in re-partitioning, sorting, shuffling, and writing out +data required in the general case.

+ +

Interface

+ +

The attached code offers the following interface to users of these +classes.

+ + + + + + + + + +
  property                    | required | value
  ----------------------------+----------+----------------------------------------------------
  mapred.join.expr            | yes      | Join expression to effect over input data
  mapred.join.keycomparator   | no       | WritableComparator class to use for comparing keys
  mapred.join.define.<ident>  | no       | Class mapped to identifier in join expression
+ +

The join expression understands the following grammar:

+ +
func ::= <ident>([<func>,]*<func>)
+func ::= tbl(<class>,"<path>");
+
+
+ +

Operations included in this patch are partitioned into one of two types: +join operations emitting tuples and "multi-filter" operations emitting a +single value from (but not necessarily included in) a set of input values. +For a given key, each operation will consider the cross product of all +values for all sources at that node.

+ +

Identifiers supported by default:

+ + + + + + + +
  identifier | type        | description
  -----------+-------------+----------------------------------------------------------
  inner      | Join        | Full inner join
  outer      | Join        | Full outer join
  override   | MultiFilter | For a given key, prefer values from the rightmost source
+ +

A user of this class must set the InputFormat for the job to +CompositeInputFormat and define a join expression accepted by the +preceding grammar. For example, both of the following are acceptable:

+ +
inner(tbl(org.apache.hadoop.mapred.SequenceFileInputFormat.class,
+          "hdfs://host:8020/foo/bar"),
+      tbl(org.apache.hadoop.mapred.SequenceFileInputFormat.class,
+          "hdfs://host:8020/foo/baz"))
+
+outer(override(tbl(org.apache.hadoop.mapred.SequenceFileInputFormat.class,
+                   "hdfs://host:8020/foo/bar"),
+               tbl(org.apache.hadoop.mapred.SequenceFileInputFormat.class,
+                   "hdfs://host:8020/foo/baz")),
+      tbl(org.apache.hadoop.mapred.SequenceFileInputFormat.class,
+          "hdfs://host:8020/foo/rab"))
+
+ +

CompositeInputFormat includes a handful of convenience methods to +aid construction of these verbose statements.
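For instance, the first expression above could be built with the compose convenience method rather than being spelled out by hand (job setup shown only in part, class and path names as in the example):

     JobConf job = new JobConf(MyJoinJob.class);        // hypothetical job class
     job.setInputFormat(CompositeInputFormat.class);
     job.set("mapred.join.expr",
         CompositeInputFormat.compose("inner", SequenceFileInputFormat.class,
             "hdfs://host:8020/foo/bar", "hdfs://host:8020/foo/baz"));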

+ +

As in the second example, joins may be nested. Users may provide a +comparator class in the mapred.join.keycomparator property to specify +the ordering of their keys, or accept the default comparator as returned by +WritableComparator.get(keyclass).

+ +

Users can specify their own join operations, typically by overriding JoinRecordReader or MultiFilterRecordReader and mapping that class to an identifier in the join expression using the mapred.join.define.ident property, where ident is the identifier appearing in the join expression. Users may elect to emit or modify values passing through their join operation. Consulting the existing operations for guidance is recommended. Adding arguments is considerably more complex (and only partially supported), as one must also add a Node type to the parse tree. One is probably better off extending RecordReader in most cases.

+ +JIRA]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Map implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured to use this MapRunnable class (using the JobConf.setMapRunnerClass method) and the number of threads the thread-pool can use with the mapred.map.multithreadedrunner.threads property; its default value is 10 threads. +
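A configuration sketch (the job class name is a placeholder):

     JobConf job = new JobConf(MyJob.class);
     job.setMapRunnerClass(MultithreadedMapRunner.class);      // the Mapper must be thread-safe
     job.setInt("mapred.map.multithreadedrunner.threads", 20); // default is 10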

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pairs. Uses + {@link StringTokenizer} to break text into tokens.]]> + + + + + Library of generally useful mappers, reducers, and partitioners.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + generateKeyValPairs(Object key, Object value); public void + configure(JobConfjob); } + + The package also provides a base class, ValueAggregatorBaseDescriptor, + implementing the above interface. The user can extend the base class and + implement generateKeyValPairs accordingly. + + The primary work of generateKeyValPairs is to emit one or more key/value + pairs based on the input key/value pair. The key in an output key/value pair + encode two pieces of information: aggregation type and aggregation id. The + value will be aggregated onto the aggregation id according the aggregation + type. + + This class offers a function to generate a map/reduce job using Aggregate + framework. The function takes the following parameters: input directory spec + input format (text or sequence file) output directory a file specifying the + user plugin class]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Aggregate framework

+

+Generally speaking, in order to implement an application using the Map/Reduce model, the developer needs to implement Map and Reduce functions (and possibly a Combine function). However, for a lot of applications related to counting and statistics computing, these functions have very similar characteristics. This package implements those patterns. In particular, it provides a generic mapper class, a reducer class and a combiner class, and a set of built-in value aggregators. It also provides a generic utility class, ValueAggregatorJob, that offers a static function that creates map/reduce jobs:

+
+public static JobConf createValueAggregatorJob(String args[]) throws IOException;
+
+
+To call this function, the user needs to pass in arguments specifying the input directories, the output directory, the number of reducers, the input data format (textinputformat or sequencefileinputformat), and a file specifying the user plugin class(es) to be loaded by the mapper. A user plugin class is responsible for specifying what aggregators to use and what values are for which aggregators. A plugin class must implement the following interface:
+
+ public interface ValueAggregatorDescriptor { 
+     public ArrayList<Entry> generateKeyValPairs(Object key, Object value); 
+     public void configure(JobConf job); 
+} 
+
+
+Function generateKeyValPairs will generate aggregation key/value pairs for the +input key/value pair. Each aggregation key encodes two pieces of information: the aggregation type and aggregation ID. +The value is the value to be aggregated onto the aggregation ID according to the aggregation type. Here +is a simple example user plugin class for counting the words in the input texts: +
+
+public class WordCountAggregatorDescriptor extends ValueAggregatorBaseDescriptor { 
+    public ArrayList<Entry> generateKeyValPairs(Object key, Object val) {
+        String words [] = val.toString().split(" |\t");
+        ArrayList<Entry> retv = new ArrayList<Entry>();
+        for (int i = 0; i < words.length; i++) {
+            retv.add(generateEntry(LONG_VALUE_SUM, words[i], ONE));
+        }
+        return retv;
+    }
+    public void configure(JobConf job) {}
+} 
+
+
+In the above code, LONG_VALUE_SUM is a string denoting the aggregation type LongValueSum, which sums over long values. ONE denotes the string "1". The call generateEntry(LONG_VALUE_SUM, words[i], ONE) will interpret the first argument as an aggregation type, the second as an aggregation ID, and the third argument as the value to be aggregated. The output key will look like "LongValueSum:xxxx", where xxxx is the string value of words[i], and the value will be "1". The mapper will call generateKeyValPairs(Object key, Object val) for each input key/value pair to generate the desired aggregation id/value pairs. The downstream combiner/reducer will interpret these pairs as adding one to the aggregator xxxx.

+Class ValueAggregatorBaseDescriptor is a base class that user plugin classes can extend. Here is the XML fragment specifying the user plugin class: +

+
+<property>
+    <name>aggregator.descriptor.num</name>
+    <value>1</value>
+</property>
+<property>
+   <name>aggregator.descriptor.0</name>
+   <value>UserDefined,org.apache.hadoop.mapred.lib.aggregate.examples.WordCountAggregatorDescriptor</value>
+</property> 
+
+
+Class ValueAggregatorBaseDescriptor itself provides a default implementation for generateKeyValPairs: +
+
+public ArrayList<Entry> generateKeyValPairs(Object key, Object val) {
+   ArrayList<Entry> retv = new ArrayList<Entry>();     
+   String countType = LONG_VALUE_SUM;
+   String id = "record_count";
+   retv.add(generateEntry(countType, id, ONE));
+   return retv;
+}
+
+
+Thus, if no user plugin class is specified, the default behavior of the map/reduce job is to count the number of records (lines) in the imput files. +

+During runtime, the mapper will invoke the generateKeyValPairs function for each input key/value pair, and emit the generated +key/value pairs: +

+
+public void map(WritableComparable key, Writable value,
+            OutputCollector output, Reporter reporter) throws IOException {
+   Iterator iter = this.aggregatorDescriptorList.iterator();
+   while (iter.hasNext()) {
+       ValueAggregatorDescriptor ad = (ValueAggregatorDescriptor) iter.next();
+       Iterator<Entry> ens = ad.generateKeyValPairs(key, value).iterator();
+       while (ens.hasNext()) {
+           Entry en = ens.next();
+           output.collect((WritableComparable)en.getKey(), (Writable)en.getValue());
+       }
+   }
+}
+
+
+The reducer will create an aggregator object for each key/value list pair, and perform the appropriate aggregation. +At the end, it will emit the aggregator's results: +
+
+public void reduce(WritableComparable key, Iterator values,
+            OutputCollector output, Reporter reporter) throws IOException {
+   String keyStr = key.toString();
+   int pos = keyStr.indexOf(ValueAggregatorDescriptor.TYPE_SEPARATOR);
+   String type = keyStr.substring(0,pos);
+   keyStr = keyStr.substring(pos+ValueAggregatorDescriptor.TYPE_SEPARATOR.length());       
+   ValueAggregator aggregator = 
+       ValueAggregatorBaseDescriptor.generateValueAggregator(type);
+   while (values.hasNext()) {
+       aggregator.addNextValue(values.next());
+   }         
+   String val = aggregator.getReport();
+   key = new Text(keyStr);
+   output.collect(key, new Text(val)); 
+}
+
+
+In order to be able to use a combiner, all the aggregators used must be associative and commutative. +The following are the types supported:
    +
  • LongValueSum: sum over long values +
  • DoubleValueSum: sum over float/double values +
  • uniqValueCount: count the number of distinct values +
  • ValueHistogram: compute the histogram of values and the minimum, maximum, median, average, and standard deviation of numeric values +
+

+

Create and run an application

+

+To create an application, the user needs to do the following things: +

+1. Implement a user plugin: +

+
+import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorBaseDescriptor;
+import org.apache.hadoop.mapred.JobConf;
+
+public class WordCountAggregatorDescriptor extends ValueAggregatorBaseDescriptor {
+   public void map(WritableComparable key, Writable value,
+            OutputCollector output, Reporter reporter) throws IOException {
+   }
+   public void configure(JobConf job) {
+    
+   } 
+}
+
+
+ +2. Create an xml file specifying the user plugin. +

+3. Compile your java class and create a jar file, say wc.jar. + +

+Finally, run the job: +

+
+        hadoop jar wc.jar org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorJob indirs outdir numofreducers textinputformat|sequencefileinputformat spec_file
+
+
+

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +The class org.apache.hadoop.mapred.pipes.Submitter has a public static +method to submit a job as a JobConf and a main method that takes an +application and optional configuration file, input directories, and +output directory. The cli for the main looks like: + +

+bin/hadoop pipes \
+  [-conf path] \
+  [-input inputDir] \
+  [-output outputDir] \
+  [-jar applicationJarFile] \
+  [-inputformat class] \
+  [-map class] \
+  [-partitioner class] \
+  [-reduce class] \
+  [-writer class] \
+  [-program program url]
+
+ +

+ +The application programs link against a thin C++ wrapper library that handles the communication with the rest of the Hadoop system. The C++ interface is "swigable" so that interfaces can be generated for Python and other scripting languages. All of the C++ functions and classes are in the HadoopPipes namespace. The job may consist of any combination of Java and C++ RecordReaders, Mappers, Partitioner, Combiner, Reducer, and RecordWriter.

+ +Hadoop Pipes has a generic Java class for handling the mapper and +reducer (PipesMapRunner and PipesReducer). They fork off the +application program and communicate with it over a socket. The +communication is handled by the C++ wrapper library and the +PipesMapRunner and PipesReducer. + +

+ +The application program passes in a factory object that can create +the various objects needed by the framework to the runTask +function. The framework creates the Mapper or Reducer as +appropriate and calls the map or reduce method to invoke the +application's code. The JobConf is available to the application. + +

+ +The Mapper and Reducer objects get all of their inputs, outputs, and +context via context objects. The advantage of using the context +objects is that their interface can be extended with additional +methods without breaking clients. Although this interface is different +from the current Java interface, the plan is to migrate the Java +interface in this direction. + +

+ +Although the Java implementation is typed, the C++ interfaces of keys +and values is just a byte buffer. Since STL strings provide precisely +the right functionality and are standard, they will be used. The +decision to not use stronger types was to simplify the interface. + +

+ +The application can also define combiner functions. The combiner will +be run locally by the framework in the application process to avoid +the round trip to the Java process and back. Because the compare +function is not available in C++, the combiner will use memcmp to +sort the inputs to the combiner. This is not as general as the Java +equivalent, which uses the user's comparator, but should cover the +majority of the use cases. As the map function outputs key/value +pairs, they will be buffered. When the buffer is full, it will be +sorted and passed to the combiner. The output of the combiner will be +sent to the Java process. + +

+ +The application can also set a partition function to control which key +is given to a particular reduce. If a partition function is not +defined, the Java one will be used. The partition function will be +called by the C++ framework before the key/value pair is sent back to +Java.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When constructing the instance, if the factory property + contextName.class exists, + its value is taken to be the name of the class to instantiate. Otherwise, + the default is to create an instance of + org.apache.hadoop.metrics.spi.NullContext, which is a + dummy "no-op" context which will cause all metric data to be discarded. + + @param contextName the name of the context + @return the named MetricsContext]]> + + + + + + + + + + + + + + When the instance is constructed, this method checks if the file + hadoop-metrics.properties exists on the class path. If it + exists, it must be in the format defined by java.util.Properties, and all + the properties in the file are set as attributes on the newly created + ContextFactory instance. + + @return the singleton ContextFactory instance]]> + + + + getFactory() method.]]> + + + + + + + + + + + + + + + + + + + startMonitoring() again after calling + this. + @see #close()]]> + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A record name identifies the kind of data to be reported. For example, a + program reporting statistics relating to the disks on a computer might use + a record name "diskStats".

+ + A record has zero or more tags. A tag has a name and a value. To + continue the example, the "diskStats" record might use a tag named + "diskName" to identify a particular disk. Sometimes it is useful to have + more than one tag, so there might also be a "diskType" with value "ide" or + "scsi" or whatever.

+ + A record also has zero or more metrics. These are the named + values that are to be reported to the metrics system. In the "diskStats" + example, possible metric names would be "diskPercentFull", "diskPercentBusy", + "kbReadPerSecond", etc.

+ + The general procedure for using a MetricsRecord is to fill in its tag and + metric values, and then call update() to pass the record to the + client library. + Metric data is not immediately sent to the metrics system + each time that update() is called. + An internal table is maintained, identified by the record name. This + table has columns + corresponding to the tag and the metric names, and rows + corresponding to each unique set of tag values. An update + either modifies an existing row in the table, or adds a new row with a set of + tag values that are different from all the other rows. Note that if there + are no tags, then there can be at most one row in the table.

+ + Once a row is added to the table, its data will be sent to the metrics system + on every timer period, whether or not it has been updated since the previous + timer period. If this is inappropriate, for example if metrics were being + reported by some transient object in an application, the remove() + method can be used to remove the row and thus stop the data from being + sent.

+ + Note that the update() method is atomic. This means that it is + safe for different threads to be updating the same metric. More precisely, + it is OK for different threads to call update() on MetricsRecord instances + with the same set of tag names and tag values. Different threads should + not use the same MetricsRecord instance at the same time.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MetricsContext.registerUpdater().]]> + + + + + +The API is abstract so that it can be implemented on top of +a variety of metrics client libraries. The choice of +client library is a configuration option, and different +modules within the same application can use +different metrics implementation libraries. +

+Sub-packages: +

+
org.apache.hadoop.metrics.spi
+
The abstract Server Provider Interface package. Those wishing to + integrate the metrics API with a particular metrics client library should + extend this package.
+ +
org.apache.hadoop.metrics.file
+
An implementation package which writes the metric data to + a file, or sends it to the standard output stream.
+ +
org.apache.hadoop.metrics.ganglia
+
An implementation package which sends metric data to + Ganglia.
+
+ +

Introduction to the Metrics API

+ +Here is a simple example of how to use this package to report a single +metric value: +
+    private ContextFactory contextFactory = ContextFactory.getFactory();
+    
+    void reportMyMetric(float myMetric) {
+        MetricsContext myContext = contextFactory.getContext("myContext");
+        MetricsRecord myRecord = myContext.getRecord("myRecord");
+        myRecord.setMetric("myMetric", myMetric);
+        myRecord.update();
+    }
+
+ +In this example there are three names: +
+
myContext
+
The context name will typically identify either the application, or else a + module within an application or library.
+ +
myRecord
+
The record name generally identifies some entity for which a set of + metrics are to be reported. For example, you could have a record named + "cacheStats" for reporting a number of statistics relating to the usage of + some cache in your application.
+ +
myMetric
+
This identifies a particular metric. For example, you might have metrics + named "cache_hits" and "cache_misses". +
+
+ +

Tags

+ +In some cases it is useful to have multiple records with the same name. For +example, suppose that you want to report statistics about each disk on a computer. +In this case, the record name would be something like "diskStats", but you also +need to identify the disk which is done by adding a tag to the record. +The code could look something like this: +
+    private MetricsRecord diskStats =
+            contextFactory.getContext("myContext").getRecord("diskStats");
+            
+    void reportDiskMetrics(String diskName, float diskBusy, float diskUsed) {
+        diskStats.setTag("diskName", diskName);
+        diskStats.setMetric("diskBusy", diskBusy);
+        diskStats.setMetric("diskUsed", diskUsed);
+        diskStats.update();
+    }
+
+ +

Buffering and Callbacks

+ +Data is not sent immediately to the metrics system when +MetricsRecord.update() is called. Instead it is stored in an +internal table, and the contents of the table are sent periodically. +This can be important for two reasons: +
    +
  1. It means that a programmer is free to put calls to this API in an + inner loop, since updates can be very frequent without slowing down + the application significantly.
  2. +
  3. Some implementations can gain efficiency by combining many metrics + into a single UDP message.
  4. +
+ +The API provides a timer-based callback via the +registerUpdater() method. The benefit of this +versus using java.util.Timer is that the callbacks will be done +immediately before sending the data, making the data as current as possible. + +
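A sketch of such a callback, assuming the Updater interface of this package; doUpdates is invoked immediately before each transmission of the buffered data, and the class, record and metric names here are placeholders:

     public class MyInstrumentation implements Updater {
       private final MetricsRecord record;
       private int requests;                            // value maintained by the application
       public MyInstrumentation(MetricsContext context) {
         record = context.createRecord("myRecord");
         context.registerUpdater(this);
       }
       public synchronized void incrRequests() { requests++; }
       public synchronized void doUpdates(MetricsContext unused) {
         record.setMetric("requests", requests);        // copy the current value into the record
         record.update();
       }
     }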

Configuration

+ +It is possible to programmatically examine and modify configuration data +before creating a context, like this: +
+    ContextFactory factory = ContextFactory.getFactory();
+    ... examine and/or modify factory attributes ...
+    MetricsContext context = factory.getContext("myContext");
+
+The factory attributes can be examined and modified using the following +ContextFactorymethods: +
    +
  • Object getAttribute(String attributeName)
  • +
  • String[] getAttributeNames()
  • +
  • void setAttribute(String name, Object value)
  • +
  • void removeAttribute(attributeName)
  • +
+ +

+ContextFactory.getFactory() initializes the factory attributes by +reading the properties file hadoop-metrics.properties if it exists +on the class path. + +

+A factory attribute named: +

+contextName.class
+
+should have as its value the fully qualified name of the class to be +instantiated by a call of the CodeFactory method +getContext(contextName). If this factory attribute is not +specified, the default is to instantiate +org.apache.hadoop.metrics.file.FileContext. + +

+Other factory attributes are specific to a particular implementation of this +API and are documented elsewhere. For example, configuration attributes for +the file and Ganglia implementations can be found in the javadoc for +their respective packages.]]> + + + + + + + + + + + + + + + + + + + + + + + fileName attribute, + if specified. Otherwise the data will be written to standard + output.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is configured by setting ContextFactory attributes which in turn + are usually configured through a properties file. All the attributes are + prefixed by the contextName. For example, the properties file might contain: +

+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ 
]]> +
+ + + + +These are the implementation specific factory attributes +(See ContextFactory.getFactory()): + +
+
contextName.fileName
+
The path of the file to which metrics in context contextName + are to be appended. If this attribute is not specified, the metrics + are written to standard output by default.
+ +
contextName.period
+
The period in seconds on which the metric data is written to the + file.
+ +
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +Implementation of the metrics package that sends metric data to +Ganglia. +Programmers should not normally need to use this package directly. Instead +they should use org.hadoop.metrics. + +

+These are the implementation specific factory attributes +(See ContextFactory.getFactory()): + +

+
contextName.servers
+
Space and/or comma separated sequence of servers to which UDP + messages should be sent.
+ +
contextName.period
+
The period in seconds on which the metric data is sent to the + server(s).
+ +
contextName.units.recordName.metricName
+
The units for the specified metric in the specified record.
+ +
contextName.slope.recordName.metricName
+
The slope for the specified metric in the specified record.
+ +
contextName.tmax.recordName.metricName
+
The tmax for the specified metric in the specified record.
+ +
contextName.dmax.recordName.metricName
+
The dmax for the specified metric in the specified record.
+ +
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contextName.tableName. The returned map consists of + those attributes with the contextName and tableName stripped off.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class implements the internal table of metric data, and the timer + on which data is to be sent to the metrics system. Subclasses must + override the abstract emitRecord method in order to transmit + the data.

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + update + and remove().]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hostname or hostname:port. If + the specs string is null, defaults to localhost:defaultPort. + + @return a list of InetSocketAddress objects.]]> + + + + + + + + + org.apache.hadoop.metrics.file and +org.apache.hadoop.metrics.ganglia.

+ +Plugging in an implementation involves writing a concrete subclass of +AbstractMetricsContext. The subclass should get its + configuration information using the getAttribute(attributeName) + method.]]> + + + + + + + + + + + + + ,name=" + Where the and are the supplied parameters + + @param serviceName + @param nameName + @param theMbean - the MBean to register + @return the named used to register the MBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.rpc.socket.factory.class.<ClassName>. When no + such parameter exists then fall back on the default socket factory as + configured by hadoop.rpc.socket.factory.class.default. If + this default socket factory is not configured, then fall back on the JVM + default socket factory. + + @param conf the configuration + @param clazz the class (usually a {@link VersionedProtocol}) + @return a socket factory]]> + + + + + + hadoop.rpc.socket.factory.default + + @param conf the configuration + @return the default socket factory as specified in the configuration or + the JVM default socket factory if the configuration does not + contain a default socket factory property.]]> + + + + + + + + + + + + + : + ://:/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketInputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getInputStream()} is returned. In the later + case, the timeout argument is ignored and the timeout set with + {@link Socket#setSoTimeout(int)} applies for reads.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see #getInputStream(Socket, long) + + @param socket + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + +
+ + From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketOutputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getOutputStream()} is returned. In the later + case, the timeout argument is ignored and the write will wait until + data is available.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see #getOutputStream(Socket, long) + + @param socket + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+
+ + + + + + + + + + + + + + + + + + + + + node + + @param node + a node + @return true if node is already in the tree; false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scope + if scope starts with ~, choose one from the all nodes except for the + ones in scope; otherwise, choose one from scope + @param scope range of nodes from which a node will be choosen + @return the choosen node]]> + + + + + + + scope but not in excludedNodes + if scope starts with ~, return the number of nodes that are not + in scope and excludedNodes; + @param scope a path string that may start with ~ + @param excludedNodes a list of nodes + @return number of available nodes]]> + + + + + + + + + + + + reader + It linearly scans the array, if a local node is found, swap it with + the first element of the array. + If a local rack node is found, swap it with the first element following + the local node. + If neither local node or local rack node is found, put a random replica + location at postion 0. + It leaves the rest nodes untouched.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout in milliseconds; must not be negative. + @throws IOException]]> +
+
+ + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + +
+ + Create a new output stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout in milliseconds; must not be negative. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = getCount(). + @param newCapacity The new capacity in bytes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index idx = startVector(...); + while (!idx.done()) { + .... // read element of a vector + idx.incr(); + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Introduction + + Software systems of any significant complexity require mechanisms for data +interchange with the outside world. These interchanges typically involve the +marshaling and unmarshaling of logical units of data to and from data streams +(files, network connections, memory buffers etc.). Applications usually have +some code for serializing and deserializing the data types that they manipulate +embedded in them. The work of serialization has several features that make +automatic code generation for it worthwhile. Given a particular output encoding +(binary, XML, etc.), serialization of primitive types and simple compositions +of primitives (structs, vectors etc.) is a very mechanical task. Manually +written serialization code can be susceptible to bugs especially when records +have a large number of fields or a record definition changes between software +versions. Lastly, it can be very useful for applications written in different +programming languages to be able to share and interchange data. This can be +made a lot easier by describing the data records manipulated by these +applications in a language agnostic manner and using the descriptions to derive +implementations of serialization in multiple target languages. + +This document describes Hadoop Record I/O, a mechanism that is aimed +at +

    +
  • enabling the specification of simple serializable data types (records) +
  • enabling the generation of code in multiple target languages for +marshaling and unmarshaling such types +
  • providing target language specific support that will enable application +programmers to incorporate generated code into their applications +
+ +The goals of Hadoop Record I/O are similar to those of mechanisms such as XDR, +ASN.1, PADS and ICE. While these systems all include a DDL that enables +the specification of most record types, they differ widely in what else they +focus on. The focus in Hadoop Record I/O is on data marshaling and +multi-lingual support. We take a translator-based approach to serialization. +Hadoop users have to describe their data in a simple data description +language. The Hadoop DDL translator rcc generates code that users +can invoke in order to read/write their data from/to simple stream +abstractions. Next we list explicitly some of the goals and non-goals of +Hadoop Record I/O. + + +

Goals

+ +
    +
  • Support for commonly used primitive types. Hadoop should include as +primitives commonly used builtin types from programming languages we intend to +support. + +
  • Support for common data compositions (including recursive compositions). +Hadoop should support widely used composite types such as structs and +vectors. + +
  • Code generation in multiple target languages. Hadoop should be capable of +generating serialization code in multiple target languages and should be +easily extensible to new target languages. The initial target languages are +C++ and Java. + +
  • Support for generated target languages. Hadoop should include support +in the form of headers, libraries, packages for supported target languages +that enable easy inclusion and use of generated code in applications. +
  • Support for multiple output encodings. Candidates include +packed binary, comma-separated text, XML etc. + +
  • Support for specifying record types in a backwards/forwards compatible +manner. This will probably be in the form of support for optional fields in +records. This version of the document does not include a description of the +planned mechanism; we intend to include it in the next iteration. +
+ +

Non-Goals

+ +
    +
  • Serializing existing arbitrary C++ classes. +
  • Serializing complex data structures such as trees, linked lists etc. +
  • Built-in indexing schemes, compression, or check-sums. +
  • Dynamic construction of objects from an XML schema. +
+ +The remainder of this document describes the features of Hadoop record I/O +in more detail. Section 2 describes the data types supported by the system. +Section 3 lays out the DDL syntax with some examples of simple records. +Section 4 describes the process of code generation with rcc. Section 5 +describes target language mappings and support for Hadoop types. We include a +fairly complete description of C++ mappings with intent to include Java and +others in upcoming iterations of this document. The last section talks about +supported output encodings. + + +

Data Types and Streams

+ +This section describes the primitive and composite types supported by Hadoop. +We aim to support a set of types that can be used to simply and efficiently +express a wide range of record types in different programming languages. + +

Primitive Types

+ +For the most part, the primitive types of Hadoop map directly to primitive +types in high level programming languages. Special cases are the +ustring (a Unicode string) and buffer types, which we believe +find wide use and which are usually implemented in library code and not +available as language built-ins. Hadoop also supplies these via library code +when a target language built-in is not present and there is no widely +adopted "standard" implementation. The complete list of primitive types is: + +
    +
  • byte: An 8-bit unsigned integer. +
  • boolean: A boolean value. +
  • int: A 32-bit signed integer. +
  • long: A 64-bit signed integer. +
  • float: A single precision floating point number as described by + IEEE-754. +
  • double: A double precision floating point number as described by + IEEE-754. +
  • ustring: A string consisting of Unicode characters. +
  • buffer: An arbitrary sequence of bytes. +
+ + +

Composite Types

+Hadoop supports a small set of composite types that enable the description +of simple aggregate types and containers. A composite type is serialized +by sequentially serializing its constituent elements. The supported +composite types are: +
    + +
  • record: An aggregate type like a C-struct. This is a list of +typed fields that are together considered a single unit of data. A record +is serialized by sequentially serializing its constituent fields. In addition +to serialization, a record has comparison operations (equality and less-than) +implemented for it; these are defined as memberwise comparisons. +
  • vector: A sequence of entries of the same data type, primitive +or composite. + +
  • map: An associative container mapping instances of a key type to +instances of a value type. The key and value types may themselves be primitive +or composite types. + +
+ +

Streams

+ +Hadoop generates code for serializing and deserializing record types to +abstract streams. For each target language Hadoop defines very simple input +and output stream interfaces. Application writers can usually develop +concrete implementations of these by putting a one-method wrapper around +an existing stream implementation, as sketched below. + +
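+ A minimal sketch of that wrapper idea in Java. The InStream interface below
+ is a stand-in modeled on the C++ hadoop::InStream declared later in this
+ document, not an actual class from the Hadoop record package.
+
+ import java.io.IOException;
+ import java.io.InputStream;
+
+ // Minimal input stream abstraction, analogous to the C++ InStream below.
+ interface InStream {
+   int read(byte[] buf, int off, int len) throws IOException; // -1 on EOF
+ }
+
+ // A concrete implementation is a thin wrapper over java.io.InputStream.
+ class InputStreamAdapter implements InStream {
+   private final InputStream in;
+   InputStreamAdapter(InputStream in) { this.in = in; }
+   public int read(byte[] buf, int off, int len) throws IOException {
+     return in.read(buf, off, len); // delegate directly to the wrapped stream
+   }
+ }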

DDL Syntax and Examples

+ +We now describe the syntax of the Hadoop data description language. This is +followed by a few examples of DDL usage. + +

Hadoop DDL Syntax

+ +

+recfile = *include module *record
+include = "include" path
+path = (relative-path / absolute-path)
+module = "module" module-name
+module-name = name *("." name)
+record := "class" name "{" 1*(field) "}"
+field := type name ";"
+name :=  ALPHA (ALPHA / DIGIT / "_" )*
+type := (ptype / ctype)
+ptype := ("byte" / "boolean" / "int" /
+          "long" / "float" / "double" /
+          "ustring" / "buffer")
+ctype := (("vector" "<" type ">") /
+          ("map" "<" type "," type ">") /
+          name)
+
+ +A DDL file describes one or more record types. It begins with zero or +more include declarations, a single mandatory module declaration +followed by zero or more class declarations. The semantics of each of +these declarations are described below: + +
    + +
  • include: An include declaration specifies a DDL file to be +referenced when generating code for types in the current DDL file. Record types +in the current compilation unit may refer to types in all included files. +File inclusion is recursive. An include does not trigger code +generation for the referenced file. + +
  • module: Every Hadoop DDL file must have a single module +declaration that follows the list of includes and precedes all record +declarations. A module declaration identifies a scope within which +the names of all types in the current file are visible. Module names are +mapped to C++ namespaces, Java packages etc. in generated code. + +
  • class: Record types are specified through class +declarations. A class declaration is like a Java class declaration. +It specifies a named record type and a list of fields that constitute records +of the type. Usage is illustrated in the following examples. +
+ +

Examples

+ +
    +
  • A simple DDL file links.jr with just one record declaration. +
    
    +module links {
    +    class Link {
    +        ustring URL;
    +        boolean isRelative;
    +        ustring anchorText;
    +    };
    +}
    +
    + +
  • A DDL file outlinks.jr which includes another +
    
    +include "links.jr"
    +
    +module outlinks {
    +    class OutLinks {
    +        ustring baseURL;
    +        vector<links.Link> outLinks;
    +    };
    +}
    +
    +
+ +

Code Generation

+ +The Hadoop translator is written in Java. Invocation is done by executing a +wrapper shell script named rcc. It takes a list of +record description files as a mandatory argument and an +optional language argument, --language or -l (the default is Java). +Thus a typical invocation would look like: +

+$ rcc -l C++ <filename> ...
+
+ + +

Target Language Mappings and Support

+ +For all target languages, the unit of code generation is a record type. +For each record type, Hadoop generates code for serialization and +deserialization, record comparison and access to record members. + +

C++

+ +Support for including Hadoop generated C++ code in applications comes in the +form of a header file recordio.hh, which needs to be included in source +that uses Hadoop types, and a library librecordio.a, which applications need +to be linked with. The header declares the Hadoop C++ namespace, which defines +appropriate types for the various primitives and the basic interfaces for +records and streams, and enumerates the supported serialization encodings. +Declarations of these interfaces and a description of their semantics follow: + +

+namespace hadoop {
+
+  enum RecFormat { kBinary, kXML, kCSV };
+
+  class InStream {
+  public:
+    virtual ssize_t read(void *buf, size_t n) = 0;
+  };
+
+  class OutStream {
+  public:
+    virtual ssize_t write(const void *buf, size_t n) = 0;
+  };
+
+  class IOError : public runtime_error {
+  public:
+    explicit IOError(const std::string& msg);
+  };
+
+  class IArchive;
+  class OArchive;
+
+  class RecordReader {
+  public:
+    RecordReader(InStream& in, RecFormat fmt);
+    virtual ~RecordReader(void);
+
+    virtual void read(Record& rec);
+  };
+
+  class RecordWriter {
+  public:
+    RecordWriter(OutStream& out, RecFormat fmt);
+    virtual ~RecordWriter(void);
+
+    virtual void write(Record& rec);
+  };
+
+
+  class Record {
+  public:
+    virtual std::string type(void) const = 0;
+    virtual std::string signature(void) const = 0;
+  protected:
+    virtual bool validate(void) const = 0;
+
+    virtual void
+    serialize(OArchive& oa, const std::string& tag) const = 0;
+
+    virtual void
+    deserialize(IArchive& ia, const std::string& tag) = 0;
+  };
+}
+
+ +
    + +
  • RecFormat: An enumeration of the serialization encodings supported +by this implementation of Hadoop. + +
  • InStream: A simple abstraction for an input stream. This has a +single public read method that reads n bytes from the stream into +the buffer buf. Has the same semantics as a blocking read system +call. Returns the number of bytes read or -1 if an error occurs. + +
  • OutStream: A simple abstraction for an output stream. This has a +single write method that writes n bytes to the stream from the +buffer buf. Has the same semantics as a blocking write system +call. Returns the number of bytes written or -1 if an error occurs. + +
  • RecordReader: A RecordReader reads records one at a time from +an underlying stream in a specified record format. The reader is instantiated +with a stream and a serialization format. It has a read method that +takes an instance of a record and deserializes the record from the stream. + +
  • RecordWriter: A RecordWriter writes records one at a +time to an underlying stream in a specified record format. The writer is +instantiated with a stream and a serialization format. It has a +write method that takes an instance of a record and serializes the +record to the stream. + +
  • Record: The base class for all generated record types. This has two +public methods type and signature that return the typename and the +type signature of the record. + +
+ +Two files are generated for each record file (note: not for each record). If a +record file is named "name.jr", the generated files are +"name.jr.cc" and "name.jr.hh" containing serialization +implementations and record type declarations respectively. + +For each record in the DDL file, the generated header file will contain a +class definition corresponding to the record type; method definitions for the +generated type will be present in the '.cc' file. The generated class will +inherit from the abstract class hadoop::Record. The DDL file's +module declaration determines the namespace the record belongs to. +Each '.' delimited token in the module declaration results in the +creation of a namespace. For instance, the declaration module docs.links +results in the creation of a docs namespace and a nested +docs::links namespace. In the preceding examples, the Link class +is placed in the links namespace. The header file corresponding to +the links.jr file will contain: +

+namespace links {
+  class Link : public hadoop::Record {
+    // ....
+  };
+};
+
+ +Each field within the record will cause the generation of a private member +declaration of the appropriate type in the class declaration, and one or more +accessor methods. The generated class will implement the serialize and +deserialize methods defined in hadoop::Record. It will also +implement the inspection methods type and signature from +hadoop::Record. A default constructor and virtual destructor will also +be generated. Serialization code will read/write records into streams that +implement the hadoop::InStream and the hadoop::OutStream interfaces. + +For each member of a record an accessor method is generated that returns +either the member or a reference to the member. For members that are returned +by value, a setter method is also generated. This is true for primitive +data members of the types byte, int, long, boolean, float and +double. For example, for an int field called MyField the following +code is generated. +

+...
+private:
+  int32_t mMyField;
+  ...
+public:
+  int32_t getMyField(void) const {
+    return mMyField;
+  };
+
+  void setMyField(int32_t m) {
+    mMyField = m;
+  };
+  ...
+
+ +For a ustring, buffer, or composite field, the generated code +only contains accessors that return a reference to the field. A const +and a non-const accessor are generated. For example: +

+...
+private:
+  std::string mMyBuf;
+  ...
+public:
+
+  std::string& getMyBuf() {
+    return mMyBuf;
+  };
+
+  const std::string& getMyBuf() const {
+    return mMyBuf;
+  };
+  ...
+
+ +

Examples

+ +Suppose the inclrec.jr file contains: +

+module inclrec {
+    class RI {
+        int      I32;
+        double   D;
+        ustring  S;
+    };
+}
+
+ +and the testrec.jr file contains: + +

+include "inclrec.jr"
+module testrec {
+    class R {
+        vector<float> VF;
+        RI            Rec;
+        buffer        Buf;
+    };
+}
+
+ +Then the invocation of rcc such as: +

+$ rcc -l c++ inclrec.jr testrec.jr
+
+will result in generation of four files: +inclrec.jr.{cc,hh} and testrec.jr.{cc,hh}. + +The inclrec.jr.hh will contain: + +

+#ifndef _INCLREC_JR_HH_
+#define _INCLREC_JR_HH_
+
+#include "recordio.hh"
+
+namespace inclrec {
+  
+  class RI : public hadoop::Record {
+
+  private:
+
+    int32_t      I32;
+    double       D;
+    std::string  S;
+
+  public:
+
+    RI(void);
+    virtual ~RI(void);
+
+    virtual bool operator==(const RI& peer) const;
+    virtual bool operator<(const RI& peer) const;
+
+    virtual int32_t getI32(void) const { return I32; }
+    virtual void setI32(int32_t v) { I32 = v; }
+
+    virtual double getD(void) const { return D; }
+    virtual void setD(double v) { D = v; }
+
+    virtual std::string& getS(void) { return S; }
+    virtual const std::string& getS(void) const { return S; }
+
+    virtual std::string type(void) const;
+    virtual std::string signature(void) const;
+
+  protected:
+
+    virtual void serialize(hadoop::OArchive& a) const;
+    virtual void deserialize(hadoop::IArchive& a);
+  };
+} // end namespace inclrec
+
+#endif /* _INCLREC_JR_HH_ */
+
+
+ +The testrec.jr.hh file will contain: + + +

+
+#ifndef _TESTREC_JR_HH_
+#define _TESTREC_JR_HH_
+
+#include "inclrec.jr.hh"
+
+namespace testrec {
+  class R : public hadoop::Record {
+
+  private:
+
+    std::vector<float>  VF;
+    inclrec::RI        Rec;
+    std::string        Buf;
+
+  public:
+
+    R(void);
+    virtual ~R(void);
+
+    virtual bool operator==(const R& peer) const;
+    virtual bool operator<(const R& peer) const;
+
+    virtual std::vector<float>& getVF(void);
+    virtual const std::vector<float>& getVF(void) const;
+
+    virtual std::string& getBuf(void);
+    virtual const std::string& getBuf(void) const;
+
+    virtual inclrec::RI& getRec(void);
+    virtual const inclrec::RI& getRec(void) const;
+    
+    virtual void serialize(hadoop::OArchive& a) const;
+    virtual void deserialize(hadoop::IArchive& a);
+    
+    virtual std::string type(void) const;
+    virtual std::string signature(void) const;
+  };
+}; // end namespace testrec
+#endif /* _TESTREC_JR_HH_ */
+
+
+ +

Java

+ +Code generation for Java is similar to that for C++. A Java class is generated +for each record type with private members corresponding to the fields. Getters +and setters for fields are also generated. Some differences arise in the +way comparison is expressed and in the mapping of modules to packages and +classes to files. For equality testing, an equals method is generated +for each record type. As per Java requirements a hashCode method is also +generated. For comparison a compareTo method is generated for each +record type. This has the semantics as defined by the Java Comparable +interface, that is, the method returns a negative integer, zero, or a positive +integer as the invoked object is less than, equal to, or greater than the +comparison parameter. + +A .java file is generated per record type as opposed to per DDL +file as in C++. The module declaration translates to a Java +package declaration. The module name maps to an identical Java package +name. In addition to this mapping, the DDL compiler creates the appropriate +directory hierarchy for the package and places the generated .java +files in the correct directories. + +
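+ As a hedged illustration only, the hand-written class below shows roughly the
+ shape of what rcc generates in Java for the Link record from the earlier
+ links.jr example: private fields, getters/setters, equals, hashCode and a
+ memberwise compareTo. The method names and the omitted serialization methods
+ are assumptions, not verified rcc output.
+
+ public class LinkShape implements Comparable<LinkShape> {
+   private String URL = "";
+   private boolean isRelative;
+   private String anchorText = "";
+
+   public String getURL() { return URL; }
+   public void setURL(String url) { this.URL = url; }
+   public boolean getIsRelative() { return isRelative; }
+   public void setIsRelative(boolean rel) { this.isRelative = rel; }
+   public String getAnchorText() { return anchorText; }
+   public void setAnchorText(String text) { this.anchorText = text; }
+
+   public boolean equals(Object o) {
+     if (!(o instanceof LinkShape)) return false;
+     LinkShape other = (LinkShape) o;
+     return URL.equals(other.URL)
+         && isRelative == other.isRelative
+         && anchorText.equals(other.anchorText);
+   }
+
+   public int hashCode() {
+     int h = URL.hashCode();
+     h = 37 * h + (isRelative ? 1 : 0);
+     return 37 * h + anchorText.hashCode();
+   }
+
+   // Memberwise comparison, following the Comparable contract described above.
+   public int compareTo(LinkShape other) {
+     int c = URL.compareTo(other.URL);
+     if (c != 0) return c;
+     c = Boolean.valueOf(isRelative).compareTo(Boolean.valueOf(other.isRelative));
+     if (c != 0) return c;
+     return anchorText.compareTo(other.anchorText);
+   }
+ }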

Mapping Summary

+ +

+DDL Type        C++ Type            Java Type 
+
+boolean         bool                boolean
+byte            int8_t              byte
+int             int32_t             int
+long            int64_t             long
+float           float               float
+double          double              double
+ustring         std::string         java.lang.String
+buffer          std::string         org.apache.hadoop.record.Buffer
+class type      class type          class type
+vector<type>    std::vector<type>   java.util.ArrayList<type>
+map<key,value>  std::map<key,value> java.util.TreeMap<key,value>
+
+ +

Data encodings

+ +This section describes the format of the data encodings supported by Hadoop. +Currently, three data encodings are supported, namely binary, CSV and XML. + +

Binary Serialization Format

+ +The binary data encoding format is fairly dense. Serialization of composite +types is simply defined as a concatenation of serializations of the constituent +elements (lengths are included in vectors and maps). + +Composite types are serialized as follows: +
    +
  • class: Sequence of serialized members. +
  • vector: The number of elements serialized as an int. Followed by a +sequence of serialized elements. +
  • map: The number of key value pairs serialized as an int. Followed +by a sequence of serialized (key,value) pairs. +
+ +Serialization of primitives is more interesting, with a zero compression +optimization for integral types and normalization to UTF-8 for strings. +Primitive types are serialized as follows: + +
    +
  • byte: Represented by 1 byte, as is. +
  • boolean: Represented by 1-byte (0 or 1) +
  • int/long: Integers and longs are serialized zero compressed. +Represented as 1-byte if -120 <= value < 128. Otherwise, serialized as a +sequence of 2-5 bytes for ints, 2-9 bytes for longs. The first byte represents +the number of trailing bytes, N, as the negative number (-120-N). For example, +the number 1024 (0x400) is represented by the byte sequence 'x86 x04 x00'. +This doesn't help much for 4-byte integers but does a reasonably good job with +longs without bit twiddling. A worked sketch of this encoding follows the list. +
  • float/double: Serialized in IEEE 754 single and double precision +format in network byte order. This is the format used by Java. +
  • ustring: Serialized as 4-byte zero compressed length followed by +data encoded as UTF-8. Strings are normalized to UTF-8 regardless of native +language representation. +
  • buffer: Serialized as a 4-byte zero compressed length followed by the +raw bytes in the buffer. +
+ + +
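+ A hedged, self-contained sketch of the int/long rule above, for non-negative
+ values only (this is not the actual org.apache.hadoop.record code, and it
+ does not handle negative values outside the one-byte range):
+
+ public class ZeroCompressedInt {
+   // One byte when v < 128; otherwise a marker byte (-120 - N) followed by
+   // the N significant bytes of v in big-endian order.
+   public static byte[] encode(long v) {           // requires v >= 0
+     if (v < 128) {
+       return new byte[] { (byte) v };
+     }
+     int n = 8;
+     while (n > 1 && ((v >>> ((n - 1) * 8)) & 0xff) == 0) {
+       n--;                                        // count significant bytes
+     }
+     byte[] out = new byte[n + 1];
+     out[0] = (byte) (-120 - n);                   // e.g. -122 == 0x86 for N=2
+     for (int i = 0; i < n; i++) {
+       out[i + 1] = (byte) (v >>> ((n - 1 - i) * 8));
+     }
+     return out;                                   // encode(1024) -> 86 04 00
+   }
+ }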

CSV Serialization Format

+ +The CSV serialization format has a lot more structure than the "standard" +Excel CSV format, but we believe the additional structure is useful because + +
    +
  • it makes parsing a lot easier without detracting too much from legibility +
  • the delimiters around composites make it obvious when one is reading a +sequence of Hadoop records +
+ +Serialization formats for the various types are detailed in the grammar that +follows. The notable feature of the formats is the use of delimiters for +indicating certain field types. + +
    +
  • A string field begins with a single quote ('). +
  • A buffer field begins with a sharp (#). +
  • A class, vector or map begins with 's{', 'v{' or 'm{' respectively and +ends with '}'. +
+ +The CSV format can be described by the following grammar: + +

+record = primitive / struct / vector / map
+primitive = boolean / int / long / float / double / ustring / buffer
+
+boolean = "T" / "F"
+int = ["-"] 1*DIGIT
+long = ";" ["-"] 1*DIGIT
+float = ["-"] 1*DIGIT "." 1*DIGIT ["E" / "e" ["-"] 1*DIGIT]
+double = ";" ["-"] 1*DIGIT "." 1*DIGIT ["E" / "e" ["-"] 1*DIGIT]
+
+ustring = "'" *(UTF8 char except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" )
+
+buffer = "#" *(BYTE except NULL, LF, % and , / "%00" / "%0a" / "%25" / "%2c" )
+
+struct = "s{" record *("," record) "}"
+vector = "v{" [record *("," record)] "}"
+map = "m{" [*(record "," record)] "}"
+
+ +
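+ As a hedged illustration derived from the grammar above (not an example taken
+ from the Hadoop documentation), a Link record from the earlier links.jr
+ example with URL "http://hadoop.apache.org/", isRelative false and anchorText
+ "Hadoop" would serialize in CSV roughly as:
+
+ s{'http://hadoop.apache.org/,F,'Hadoop}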

XML Serialization Format

+ +The XML serialization format is the same as that used by Apache XML-RPC +(http://ws.apache.org/xmlrpc/types.html). This is an extension of the original +XML-RPC format and adds some additional data types. Not all record I/O types +are directly expressible in this format, and access to a DDL is required in +order to convert these to valid types. All types, primitive or composite, are +represented by <value> elements. The particular XML-RPC type is +indicated by a nested element in the <value> element. The encoding for +records is always UTF-8. Primitive types are serialized as follows: +
    +
  • byte: XML tag <ex:i1>. Values: 1-byte unsigned +integers represented in US-ASCII +
  • boolean: XML tag <boolean>. Values: "0" or "1" +
  • int: XML tags <i4> or <int>. Values: 4-byte +signed integers represented in US-ASCII. +
  • long: XML tag <ex:i8>. Values: 8-byte signed integers +represented in US-ASCII. +
  • float: XML tag <ex:float>. Values: Single precision +floating point numbers represented in US-ASCII. +
  • double: XML tag <double>. Values: Double precision +floating point numbers represented in US-ASCII. +
  • ustring: XML tag <string>. Values: String values +represented as UTF-8. XML does not permit all Unicode characters in literal +data. In particular, NULLs and control chars are not allowed. Additionally, +XML processors are required to replace carriage returns with line feeds and to +replace CRLF sequences with line feeds. Programming languages that we work +with do not impose these restrictions on string types. To work around these +restrictions, disallowed characters and CRs are percent escaped in strings. +The '%' character is also percent escaped. +
  • buffer: XML tag <string>. Values: Arbitrary binary +data. Represented as hexBinary: each byte is replaced by its 2-byte +hexadecimal representation. +
+ +Composite types are serialized as follows: + +
    +
  • class: XML tag <struct>. A struct is a sequence of +<member> elements. Each <member> element has a <name> +element and a <value> element. The <name> is a string that must +match /[a-zA-Z][a-zA-Z0-9_]*/. The value of the member is represented +by a <value> element. + +
  • vector: XML tag <array>. An <array> contains a +single <data> element. The <data> element is a sequence of +<value> elements each of which represents an element of the vector. +
  • map: XML tag <array>. Same as vector. + +
+ +For example: + +

+class {
+  int           MY_INT;            // value 5
+  vector<float> MY_VEC;     // values 0.1, -0.89, 2.45e4
+  buffer        MY_BUF;            // value '\00\n\tabc%'
+}
+
+ +is serialized as + +

+<value>
+  <struct>
+    <member>
+      <name>MY_INT</name>
+      <value><i4>5</i4></value>
+    </member>
+    <member>
+      <name>MY_VEC</name>
+      <value>
+        <array>
+          <data>
+            <value><ex:float>0.1</ex:float></value>
+            <value><ex:float>-0.89</ex:float></value>
+            <value><ex:float>2.45e4</ex:float></value>
+          </data>
+        </array>
+      </value>
+    </member>
+    <member>
+      <name>MY_BUF</name>
+      <value><string>%00\n\tabc%25</string></value>
+    </member>
+  </struct>
+</value> 
+
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This task takes the given record definition files and compiles them into + java or c++ + files. It is then up to the user to compile the generated files. + +

The task requires the file or the nested fileset element to be + specified. Optional attributes are language (set the output + language, default is "java"), + destdir (name of the destination directory for generated java/c++ + code, default is ".") and failonerror (specifies error-handling + behavior; default is true).

Usage

+
+ <recordcc
+       destdir="${basedir}/gensrc"
+       language="java">
+   <fileset include="**\/*.jr" />
+ </recordcc>
+ 
]]> +
+
+ +
+ + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi as a comma separated string in + conf as a property attr + + The String starts with the user name followed by the default group names, + and other group names. + + @param conf configuration + @param attr property name + @param ugi a UnixUserGroupInformation]]> + + + + + + + + conf + + The object is expected to store with the property name attr + as a comma separated string that starts + with the user name followed by group names. + If the property name is not defined, return null. + It's assumed that there is only one UGI per user. If this user already + has a UGI in the ugi map, return the ugi in the map. + Otherwise, construct a UGI from the configuration, store it in the + ugi map and return it. + + @param conf configuration + @param attr property name + @return a UnixUGI + @throws LoginException if the stored string is ill-formatted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool supports archiving and anaylzing (sort/grep) of log-files. + It takes as input + a) Input uri which will serve uris of the logs to be archived. + b) Output directory (not mandatory). + b) Directory on dfs to archive the logs. + c) The sort/grep patterns for analyzing the files and separator for boundaries. + Usage: + Logalyzer -archive -archiveDir -analysis -logs -grep -sort -separator +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GenericOptionsParser to parse only the generic Hadoop + arguments. + + The array of string arguments other than the generic arguments can be + obtained by {@link #getRemainingArgs()}. + + @param conf the Configuration to modify. + @param args command-line arguments.]]> + + + + + GenericOptionsParser to parse given options as well + as generic Hadoop options. + + The resulting CommandLine object can be obtained by + {@link #getCommandLine()}. + + @param conf the configuration to modify + @param options options built by the caller + @param args User-specified arguments]]> + + + + + Strings containing the un-parsed arguments.]]> + + + + + CommandLine object + to process the parsed arguments. + + Note: If the object is created with + {@link #GenericOptionsParser(Configuration, String[])}, then returned + object will only contain parsed generic options. + + @return CommandLine representing list of arguments + parsed against Options descriptor.]]> + + + + + + + + + + GenericOptionsParser is a utility to parse command line + arguments generic to the Hadoop framework. + + GenericOptionsParser recognizes several standarad command + line arguments, enabling applications to easily specify a namenode, a + jobtracker, additional configuration resources etc. + +

Generic Options

+ +

The supported generic options are:

+

+     -conf <configuration file>     specify a configuration file
+     -D <property=value>            use value for given property
+     -fs <local|namenode:port>      specify a namenode
+     -jt <local|jobtracker:port>    specify a job tracker
+ 

+ +

The general command line syntax is:

+

+ bin/hadoop command [genericOptions] [commandOptions]
+ 

+ +

Generic command line arguments might modify + Configuration objects, given to constructors.

+ +

The functionality is implemented using Commons CLI.

+ +

Examples:

+

+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+ 
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+     
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+     
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+ 

+ + @see Tool + @see ToolRunner]]> +
+
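+ A hedged sketch (class name assumed) of the constructor-based usage described
+ above: parse the generic Hadoop options out of the command line, then hand
+ the leftover arguments to the application.
+
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.util.GenericOptionsParser;
+
+ public class ParseGenericOptions {
+   public static void main(String[] args) throws Exception {
+     Configuration conf = new Configuration();
+     GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+     String[] appArgs = parser.getRemainingArgs();  // non-generic arguments
+     // Generic options such as -fs and -D have already been applied to conf.
+     System.out.println("fs.default.name = " + conf.get("fs.default.name"));
+     System.out.println("application arguments: " + appArgs.length);
+   }
+ }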
+ + + + + + + + + Class<T>) of the + argument of type T. + @param The type of the argument + @param t the object to get it class + @return Class<T>]]> + + + + + + + List<T> to a an array of + T[]. + @param c the Class object of the items in the list + @param list the list to convert]]> + + + + + + List<T> to a an array of + T[]. + @param list the list to convert + @throws ArrayIndexOutOfBoundsException if the list is empty. + Use {@link #toArray(Class, List)} if the list may be empty.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-hadoop is loaded, + else false]]> + + + + + + true if native hadoop libraries, if present, can be + used for this job; false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { pq.top().change(); pq.adjustTop(); } + instead of
+  { o = pq.pop(); o.change(); pq.push(o); }
+ 
]]> +
+
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clients and/or applications can use the provided Progressable + to explicitly report progress to the Hadoop framework. This is especially + important for operations which take an insignificant amount of time since, + in-lieu of the reported progress, the framework has to assume that an error + has occured and time-out the operation.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop Pipes + or Hadoop Streaming. + + It also checks to ensure that we are running on a *nix platform else + (e.g. in Cygwin/Windows) it returns null. + @param job job configuration + @return a String[] with the ulimit command arguments or + null if we are running on a non *nix platform or + if the limit is unspecified.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell interface. + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell can be used to run unix commands like du or + df. It also offers facilities to gate commands by + time-intervals.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ShellCommandExecutorshould be used in cases where the output + of the command needs no explicit parsing and where the command, working + directory and the environment remains unchanged. The output of the command + is stored as-is and is expected to be small.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the char to be escaped + @return an escaped string]]> + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the escaped char + @return an unescaped string]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tool, is the standard for any Map-Reduce tool/application. + The tool/application should delegate the handling of + + standard command-line options to {@link ToolRunner#run(Tool, String[])} + and only handle its custom arguments.

+ +

Here is how a typical Tool is implemented:

+

+     public class MyApp extends Configured implements Tool {
+     
+       public int run(String[] args) throws Exception {
+         // Configuration processed by ToolRunner
+         Configuration conf = getConf();
+         
+         // Create a JobConf using the processed conf
+         JobConf job = new JobConf(conf, MyApp.class);
+         
+         // Process custom command-line options
+         Path in = new Path(args[1]);
+         Path out = new Path(args[2]);
+         
+         // Specify various job-specific parameters     
+         job.setJobName("my-app");
+         job.setInputPath(in);
+         job.setOutputPath(out);
+         job.setMapperClass(MyApp.MyMapper.class);
+         job.setReducerClass(MyApp.MyReducer.class);
+
+         // Submit the job, then poll for progress until the job is complete
+         JobClient.runJob(job);
+         return 0;
+       }
+       
+       public static void main(String[] args) throws Exception {
+         // Let ToolRunner handle generic command-line options 
+         int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+         
+         System.exit(res);
+       }
+     }
+ 

+ + @see GenericOptionsParser + @see ToolRunner]]> +
+
+ + + + + + + + + + + + Tool by {@link Tool#run(String[])}, after + parsing with the given generic arguments. Uses the given + Configuration, or builds one if null. + + Sets the Tool's configuration with the possibly modified + version of the conf. + + @param conf Configuration for the Tool. + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + Tool with its Configuration. + + Equivalent to run(tool.getConf(), tool, args). + + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + + + ToolRunner can be used to run classes implementing + Tool interface. It works in conjunction with + {@link GenericOptionsParser} to parse the + + generic hadoop command line arguments and modifies the + Configuration of the Tool. The + application-specific options are passed along without being modified. +

+ + @see Tool + @see GenericOptionsParser]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
diff --git a/lib/jdiff/hadoop_0.18.1.xml b/lib/jdiff/hadoop_0.18.1.xml new file mode 100644 index 00000000000..fd844cbed0f --- /dev/null +++ b/lib/jdiff/hadoop_0.18.1.xml @@ -0,0 +1,44778 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + final. + + @param name resource to be added, the classpath is examined for a file + with that name.]]> + + + + + + final. + + @param url url of the resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param file file-path of resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + name property, null if + no such property exists. + + Values are processed for variable expansion + before being returned. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + name property, without doing + variable expansion. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + + value of the name property. + + @param name property name. + @param value property value.]]> + + + + + + + name property. If no such property + exists, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value, or defaultValue if the property + doesn't exist.]]> + + + + + + + name property as an int. + + If no such property exists, or if the specified value is not a valid + int, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as an int, + or defaultValue.]]> + + + + + + + name property to an int. + + @param name property name. + @param value int value of the property.]]> + + + + + + + name property as a long. + If no such property is specified, or if the specified value is not a valid + long, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a long, + or defaultValue.]]> + + + + + + + name property to a long. + + @param name property name. + @param value long value of the property.]]> + + + + + + + name property as a float. + If no such property is specified, or if the specified value is not a valid + float, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a float, + or defaultValue.]]> + + + + + + + name property as a boolean. + If no such property is specified, or if the specified value is not a valid + boolean, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a boolean, + or defaultValue.]]> + + + + + + + name property to a boolean. + + @param name property name. + @param value boolean value of the property.]]> + + + + + + + + + + + + + name property as + a collection of Strings. + If no such property is specified then empty collection is returned. +

+ This is an optimized version of {@link #getStrings(String)} + + @param name property name. + @return property value as a collection of Strings.]]> + + + + + + name property as + an array of Strings. + If no such property is specified then null is returned. + + @param name property name. + @return property value as an array of Strings, + or null.]]> + + + + + + + name property as + an array of Strings. + If no such property is specified then default value is returned. + + @param name property name. + @param defaultValue The default value + @return property value as an array of Strings, + or default value.]]> + + + + + + + name property as + as comma delimited values. + + @param name property name. + @param values The values]]> + + + + + + + + + + + + + + name property as a Class. + If no such property is specified, then defaultValue is + returned. + + @param name the class name. + @param defaultValue default value. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property as a Class + implementing the interface specified by xface. + + If no such property is specified, then defaultValue is + returned. + + An exception is thrown if the returned class does not implement the named + interface. + + @param name the class name. + @param defaultValue default value. + @param xface the interface implemented by the named class. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property to the name of a + theClass implementing the given interface xface. + + An exception is thrown if theClass does not implement the + interface xface. + + @param name property name. + @param theClass property value. + @param xface the interface implemented by the named class.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + + + + + name. + + @param name configuration resource name. + @return an input stream attached to the resource.]]> + + + + + + name. + + @param name configuration resource name. + @return a reader attached to the resource.]]> + + + + + String + key-value pairs in the configuration. + + @return an iterator over the entries.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + true to set quiet-mode on, false + to turn it off.]]> + + + + + + + + + + + Resources + +

Configurations are specified by resources. A resource contains a set of + name/value pairs as XML data. Each resource is named by either a + String or by a {@link Path}. If named by a String, + then the classpath is examined for a file with that name. If named by a + Path, then the local filesystem is examined directly, without + referring to the classpath. + +

Hadoop by default specifies two resources, loaded in-order from the + classpath:

    +
  1. hadoop-default.xml: Read-only defaults for hadoop.
  2. hadoop-site.xml: Site-specific configuration for a given hadoop installation.
+ Applications may add additional resources, which are loaded + subsequent to these resources in the order they are added. + +

Final Parameters

+ +

Configuration parameters may be declared final. + Once a resource declares a value final, no subsequently-loaded + resource can alter that value. + For example, one might define a final parameter with: +

+  <property>
+    <name>dfs.client.buffer.dir</name>
+    <value>/tmp/hadoop/dfs/client</value>
+    <final>true</final>
+  </property>
+ + Administrators typically define parameters as final in + hadoop-site.xml for values that user applications may not alter. + +

Variable Expansion

+ +

Value strings are first processed for variable expansion. The + available properties are:

    +
  1. Other properties defined in this Configuration; and, if a name is undefined here,
  2. Properties in {@link System#getProperties()}.
+ +

For example, if a configuration resource contains the following property + definitions: +

+  <property>
+    <name>basedir</name>
+    <value>/user/${user.name}</value>
+  </property>
+  
+  <property>
+    <name>tempdir</name>
+    <value>${basedir}/tmp</value>
+  </property>
+ + When conf.get("tempdir") is called, then ${basedir} + will be resolved to another property in this Configuration, while + ${user.name} would then ordinarily be resolved to the value + of the System property with that name.]]> +
+
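+ A hedged sketch (class name assumed) of the expansion just described, setting
+ the example properties programmatically rather than through a resource file:
+
+ import org.apache.hadoop.conf.Configuration;
+
+ public class ConfExpansion {
+   public static void main(String[] args) {
+     Configuration conf = new Configuration();
+     conf.set("basedir", "/user/${user.name}");
+     conf.set("tempdir", "${basedir}/tmp");
+     // get() expands ${...} against other properties and System properties,
+     // so this prints /user/<current user>/tmp.
+     System.out.println(conf.get("tempdir"));
+   }
+ }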
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The balancer is a tool that balances disk space usage on an HDFS cluster + when some datanodes become full or when new empty nodes join the cluster. + The tool is deployed as an application program that can be run by the + cluster administrator on a live HDFS cluster while applications + adding and deleting files. + +

SYNOPSIS +

+ To start:
+      bin/start-balancer.sh [-threshold <threshold>]
+      Example: bin/start-balancer.sh
+                     start the balancer with a default threshold of 10%
+               bin/start-balancer.sh -threshold 5
+                     start the balancer with a threshold of 5%
+ To stop:
+      bin/stop-balancer.sh
+ 
+ +

DESCRIPTION +

The threshold parameter is a fraction in the range of (0%, 100%) with a + default value of 10%. The threshold sets a target for whether the cluster + is balanced. A cluster is balanced if for each datanode, the utilization + of the node (ratio of used space at the node to total capacity of the node) + differs from the utilization of the cluster (ratio of used space in the cluster + to total capacity of the cluster) by no more than the threshold value. + The smaller the threshold, the more balanced a cluster will become. + It takes more time to run the balancer for small threshold values. + Also, for a very small threshold, the cluster may not be able to reach the + balanced state when applications write and delete files concurrently.

The tool moves blocks from highly utilized datanodes to poorly + utilized datanodes iteratively. In each iteration a datanode moves or + receives no more than the lesser of 10G bytes or the threshold fraction + of its capacity. Each iteration runs no more than 20 minutes. + At the end of each iteration, the balancer obtains updated datanodes + information from the namenode. + +

A system property that limits the balancer's use of bandwidth is + defined in the default configuration file: +

+ <property>
+   <name>dfs.balance.bandwidthPerSec</name>
+   <value>1048576</value>
+   <description>Specifies the maximum bandwidth that each datanode
+ can utilize for the balancing purpose in terms of the number of bytes
+ per second.
+   </description>
+ </property>
+ +

This property determines the maximum speed at which a block will be + moved from one datanode to another. The default value is 1MB/s. The higher + the bandwidth, the faster a cluster can reach the balanced state, + but with greater competition with application processes. If an + administrator changes the value of this property in the configuration + file, the change is observed when HDFS is next restarted. + +

MONITORING BALANCER PROGRESS +

After the balancer is started, an output file name where the balancer + progress will be recorded is printed on the screen. The administrator + can monitor the running of the balancer by reading the output file. + The output shows the balancer's status iteration by iteration. In each + iteration it prints the starting time, the iteration number, the total + number of bytes that have been moved in the previous iterations, + the total number of bytes that are left to move in order for the cluster + to be balanced, and the number of bytes that are being moved in this + iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left + To Move" is decreasing. + +

Running multiple instances of the balancer in an HDFS cluster is + prohibited by the tool. + +

The balancer automatically exits when any of the following five + conditions is satisfied: +

    +
  1. The cluster is balanced; +
  2. No block can be moved; +
  3. No block has been moved for five consecutive iterations; +
  4. An IOException occurs while communicating with the namenode; +
  5. Another balancer is running. +
+ +

Upon exit, a balancer returns an exit code and prints one of the + following messages to the output file, corresponding to the above exit + reasons: +

    +
  1. The cluster is balanced. Exiting +
  2. No block can be moved. Exiting... +
  3. No block has been moved for 3 iterations. Exiting... +
  4. Received an IO exception: failure reason. Exiting... +
  5. Another balancer is running. Exiting... +
+ +

The administrator can interrupt the execution of the balancer at any + time by running the command "stop-balancer.sh" on the machine where the + balancer is running.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + in]]> + + + + + + + out.]]> + + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + stream of bytes (of BLOCK_SIZE or less) + + This info is stored on a local disk. The DataNode + reports the table's contents to the NameNode upon startup + and every so often afterwards. + + DataNodes spend their lives in an endless loop of asking + the NameNode for something to do. A NameNode cannot connect + to a DataNode directly; a NameNode simply returns values from + functions invoked by a DataNode. + + DataNodes maintain an open server socket so that client code + or other DataNodes can read/write data. The host/port for + this server is reported to the NameNode, which then sends that + information to clients or other DataNodes that might be interested.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The tool scans all files and directories, starting from an indicated + root path. The following abnormal conditions are detected and handled:

+
    +
  • files with blocks that are completely missing from all datanodes.
    + In this case the tool can perform one of the following actions: +
      +
    • none ({@link NamenodeFsck#FIXING_NONE})
    • +
    • move corrupted files to /lost+found directory on DFS + ({@link NamenodeFsck#FIXING_MOVE}). Remaining data blocks are saved as + block chains, representing the longest consecutive series of valid blocks.
    • +
    • delete corrupted files ({@link NamenodeFsck#FIXING_DELETE})
    • +
    +
  • +
  • detect files with under-replicated or over-replicated blocks
  • +
+ Additionally, the tool collects detailed overall DFS statistics, and + optionally can print detailed statistics on block locations and replication + factors of each file. + The tool also provides an option to filter open files during the scan.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + :/data[/] HTTP/1.1 + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #filesTotal}.set()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + :/listPaths[/][[&option]*] HTTP/1.1 + } + + Where option (default) in: + recursive ("no") + filter (".*") + exclude ("\..*\.crc") + + Response: A flat list of files/directories in the following format: + {@code +

+ + + + }]]> +
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The name-node can be started with one of the following startup options: +
    +
  • {@link FSConstants.StartupOption#REGULAR REGULAR} - normal startup
  • +
  • {@link FSConstants.StartupOption#FORMAT FORMAT} - format name node
  • +
  • {@link FSConstants.StartupOption#UPGRADE UPGRADE} - start the cluster + upgrade and create a snapshot of the current file system state
  • +
  • {@link FSConstants.StartupOption#ROLLBACK ROLLBACK} - roll the + cluster back to the previous state
  • +
+ The option is passed via configuration field: + dfs.namenode.startup + + The conf will be modified to reflect the actual ports on which + the NameNode is up and running if the user passes the port as + zero in the conf. + + @param conf configuration + @throws IOException]]> +
+
+ + + + zero.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + datanode whose + total size is size + + @param datanode on which blocks are located + @param size total size of blocks]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + blocksequence (namespace) + 2) block->machinelist ("inodes") + + The first table is stored on disk and is very precious. + The second table is rebuilt every time the NameNode comes + up. + + 'NameNode' refers to both this class as well as the 'NameNode server'. + The 'FSNamesystem' class actually performs most of the filesystem + management. The majority of the 'NameNode' class itself is concerned + with exposing the IPC interface to the outside world, plus some + configuration management. + + NameNode implements the ClientProtocol interface, which allows + clients to ask for DFS services. ClientProtocol is not + designed for direct use by authors of DFS client code. End-users + should instead use the org.apache.nutch.hadoop.fs.FileSystem class. + + NameNode also implements the DatanodeProtocol interface, used by + DataNode programs that actually store DFS data blocks. These + methods are invoked repeatedly and automatically by all the + DataNodes in a DFS deployment. + + NameNode also implements the NamenodeProtocol interface, used by + secondary namenodes or rebalancing processes to get partial namenode's + state, for example partial blocksMap etc.]]> + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The tool scans all files and directories, starting from an indicated + root path. The following abnormal conditions are detected and handled:

+
    +
  • files with blocks that are completely missing from all datanodes.
    + In this case the tool can perform one of the following actions: +
      +
    • none ({@link #FIXING_NONE})
    • +
    • move corrupted files to /lost+found directory on DFS + ({@link #FIXING_MOVE}). Remaining data blocks are saved as + block chains, representing the longest consecutive series of valid blocks.
    • +
    • delete corrupted files ({@link #FIXING_DELETE})
    • +
    +
  • +
  • detect files with under-replicated or over-replicated blocks
  • +
+ Additionally, the tool collects detailed overall DFS statistics, and + optionally can print detailed statistics on block locations and replication + factors of each file.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #syncs}.inc()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #blocksRead}.inc()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        dfs.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        dfs.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically. +

+ Name Node Status info is reported in another MBean + @see org.apache.hadoop.dfs.datanode.metrics.FSDatasetMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Data Node runtime statistic info is report in another MBean + @see org.apache.hadoop.dfs.datanode.metrics.DataNodeStatisticsMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Name Node runtime statistic info is report in another MBean + @see org.apache.hadoop.dfs.namenode.metrics.NameNodeStatisticsMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        dfs.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        dfs.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically. +

+ Name Node Status info is report in another MBean + @see org.apache.hadoop.dfs.namenode.metrics.FSNamesystemMBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DistributedCache is a facility provided by the Map-Reduce + framework to cache files (text, archives, jars etc.) needed by applications. +

+ +

Applications specify the files, via urls (hdfs:// or http://) to be cached + via the {@link JobConf}. The DistributedCache assumes that the + files specified via hdfs:// urls are already present on the + {@link FileSystem} at the path specified by the url.

+ +

The framework will copy the necessary files on to the slave node before + any tasks for the job are executed on that node. Its efficiency stems from + the fact that the files are only copied once per job and the ability to + cache archives which are un-archived on the slaves.

+ +

DistributedCache can be used to distribute simple, read-only + data/text files and/or more complex types such as archives, jars etc. + Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. + Jars may be optionally added to the classpath of the tasks, a rudimentary + software distribution mechanism. Files have execution permissions. + Optionally users can also direct it to symlink the distributed cache file(s) + into the working directory of the task.

+ +

DistributedCache tracks modification timestamps of the cache + files. Clearly the cache files should not be modified by the application + or externally while the job is executing.

+ +

Here is an illustrative example on how to use the + DistributedCache:

+

+     // Setting up the cache for the application
+     
+     1. Copy the requisite files to the FileSystem:
+     
+     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
+     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
+     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+     
+     2. Setup the application's JobConf:
+     
+     JobConf job = new JobConf();
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
+                                   job);
+     DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+     
+     3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+     
+     public static class MapClass extends MapReduceBase  
+     implements Mapper<K, V, K, V> {
+     
+       private Path[] localArchives;
+       private Path[] localFiles;
+       
+       public void configure(JobConf job) {
+         // Get the cached archives/files
+         localArchives = DistributedCache.getLocalCacheArchives(job);
+         localFiles = DistributedCache.getLocalCacheFiles(job);
+       }
+       
+       public void map(K key, V value, 
+                       OutputCollector<K, V> output, Reporter reporter) 
+       throws IOException {
+         // Use data from the cached archives/files here
+         // ...
+         // ...
+         output.collect(k, v);
+       }
+     }
+     
+ 

+ + @see JobConf + @see JobClient]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BufferedFSInputStream + with the specified buffer size, + and saves its argument, the input stream + in, for later use. An internal + buffer array of length size + is created and stored in buf. + + @param in the underlying input stream. + @param size the buffer size. + @exception IllegalArgumentException if size <= 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + setReplication of FileSystem + @param src file name + @param replication new replication + @throws IOException + @return true if successful; + false if file does not exist or is a directory]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ']]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fs.scheme.class whose value names the FileSystem class. + The entire URI is passed to the FileSystem instance's initialize method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Return all the files that match filePattern and are not checksum + files. Results are sorted by their names. + +

+ A filename pattern is composed of regular characters and + special pattern matching characters, which are: + +

+
+
+

+

? +
Matches any single character. + +

+

* +
Matches zero or more characters. + +

+

[abc] +
Matches a single character from character set + {a,b,c}. + +

+

[a-b] +
Matches a single character from the character range + {a...b}. Note that character a must be + lexicographically less than or equal to character b. + +

+

[^a] +
Matches a single character that is not from character set or range + {a}. Note that the ^ character must occur + immediately to the right of the opening bracket. + +

+

\c +
Removes (escapes) any special meaning of character c. + +

+

{ab,cd} +
Matches a string from the string set {ab, cd} + +

+

{ab,c{de,fh}} +
Matches a string from the string set {ab, cde, cfh} + +
+
+
+ + @param pathPattern a regular expression specifying a pth pattern + + @return an array of paths that match the path pattern + @throws IOException]]> +
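+
+ A minimal usage sketch of the glob call documented above (not part of the
+ original javadoc; the directory layout and pattern are made-up examples,
+ globPaths is assumed to be the documented method, and the usual
+ org.apache.hadoop.conf/fs imports are assumed):
+
+     Configuration conf = new Configuration();
+     FileSystem fs = FileSystem.get(conf);
+     // e.g. matches /logs/2008-05-01/part-00000, /logs/2008-05-02/part-00003, ...
+     Path[] matches = fs.globPaths(new Path("/logs/2008-05-*/part-[0-9]*"));
+     for (int i = 0; i < matches.length; i++) {
+       System.out.println(matches[i]);
+     }
+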
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All user code that may potentially use the Hadoop Distributed + File System should be written to use a FileSystem object. The + Hadoop DFS is a multi-machine system that appears as a single + disk. It's useful because of its fault tolerance and potentially + very large capacity. + +

+ The local implementation is {@link LocalFileSystem} and distributed + implementation is {@link DistributedFileSystem}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FilterFileSystem contains + some other file system, which it uses as + its basic file system, possibly transforming + the data along the way or providing additional + functionality. The class FilterFileSystem + itself simply overrides all methods of + FileSystem with versions that + pass all requests to the contained file + system. Subclasses of FilterFileSystem + may further override some of these methods + and may also provide additional methods + and fields.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + buf at offset + and checksum into checksum. + The method is used for implementing read, therefore, it should be optimized + for sequential reading + @param pos chunkPos + @param buf desitination buffer + @param offset offset in buf at which to store data + @param len maximun number of bytes to read + @return number of bytes read]]> + + + + + + + + + + + + + + + + + -1 if the end of the + stream is reached. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + This method implements the general contract of the corresponding + {@link InputStream#read(byte[], int, int) read} method of + the {@link InputStream} class. As an additional + convenience, it attempts to read as many bytes as possible by repeatedly + invoking the read method of the underlying stream. This + iterated read continues until one of the following + conditions becomes true:

    + +
  • The specified number of bytes have been read, + +
  • The read method of the underlying stream returns + -1, indicating end-of-file. + +
If the first read on the underlying stream returns + -1 to indicate end-of-file then this method returns + -1. Otherwise this method returns the number of bytes + actually read. + + @param b destination buffer. + @param off offset at which to start storing bytes. + @param len maximum number of bytes to read. + @return the number of bytes read, or -1 if the end of + the stream has been reached. + @exception IOException if an I/O error occurs. + ChecksumException if any checksum error occurs]]> +
+ + + + + + + + + + + + n bytes of data from the + input stream. + +

This method may skip more bytes than are remaining in the backing + file. This produces no exception and the number of bytes skipped + may include some number of bytes that were beyond the EOF of the + backing file. Attempting to read from the stream after skipping past + the end will result in -1 indicating the end of the file. + +

If n is negative, no bytes are skipped. + + @param n the number of bytes to be skipped. + @return the actual number of bytes skipped. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to skip to is corrupted]]> + + + + + + + This method may seek past the end of the file. + This produces no exception and an attempt to read from + the stream will result in -1 indicating the end of the file. + + @param pos the postion to seek to. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to seek to is corrupted]]> + + + + + + + + + + len bytes from + stm + + @param stm an input stream + @param buf destiniation buffer + @param offset offset at which to store data + @param len number of bytes to read + @return actual number of bytes read + @throws IOException if there is any IO error]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + len bytes from the specified byte array + starting at offset off and generate a checksum for + each data chunk. + +

This method stores bytes from the given array into this + stream's buffer before it gets checksumed. The buffer gets checksumed + and flushed to the underlying output stream when all data + in a checksum chunk are in the buffer. If the buffer is empty and + requested length is at least as large as the size of next checksum chunk + size, this method will checksum and write the chunk directly + to the underlying output stream. Thus it avoids uneccessary data copy. + + @param b the data. + @param off the start offset in the data. + @param len the number of bytes to write. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if and only if pathname + should be included]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + trash feature. Files are moved to a user's trash + directory, a subdirectory of their home directory named ".Trash". Files are + initially moved to a current sub-directory of the trash directory. + Within that sub-directory their original path is preserved. Periodically + one may checkpoint the current trash and remove older checkpoints. (This + design permits trash management without enumeration of the full trash + content, without date support in the filesystem, and without clock + synchronization.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} backed by an FTP client provided by Apache Commons Net. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is a tool for migrating data from an older to a newer version + of an S3 filesystem. +

+

+ All files in the filesystem are migrated by re-writing the block metadata + - no datafiles are touched. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + Extracts AWS credentials from the filesystem URI or configuration. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A block-based {@link FileSystem} backed by + Amazon S3. +

+ @see NativeS3FileSystem]]> +
+
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If f is a file, this method will make a single call to S3. + If f is a directory, this method will make a maximum of + (n / 1000) + 2 calls to S3, where n is the total number of + files and directories contained directly in f. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} for reading and writing files stored on + Amazon S3. + Unlike {@link org.apache.hadoop.fs.s3.S3FileSystem} this implementation + stores files on S3 in their + native form so they can be read by other S3 tools. +

+ @see org.apache.hadoop.fs.s3.S3FileSystem]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nth value.]]> + + + + + + + + + + + + + + + + + + + + + nth value in the file.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + public class IntArrayWritable extends ArrayWritable { + public IntArrayWritable() { + super(IntWritable.class); + } + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a ByteWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataInputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using DataInput methods ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataOutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using DataOutput methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + the class of the item + @param conf the configuration to store + @param item the object to be stored + @param keyName the name of the key to use + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param items the objects to be stored + @param keyName the name of the key to use + @throws IndexOutOfBoundsException if the items array is empty + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + DefaultStringifier offers convenience methods to store/load objects to/from + the configuration. + + @param the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a DoubleWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a FloatWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When two sequence files, which have same Key type but different Value + types, are mapped out to reduce, multiple Value types is not allowed. + In this case, this class can help you wrap instances with different types. +

+ +

+ Compared with ObjectWritable, this class is much more effective, + because ObjectWritable will append the class declaration as a String + into the output file in every Key-Value pair. +

+ +

+ Generic Writable implements {@link Configurable} interface, so that it will be + configured by the framework. The configuration is passed to the wrapped objects + implementing {@link Configurable} interface before deserialization. +

+ + how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), defining + the classes which will be wrapped in GenericObject in the application. + Attention: the classes defined in the getTypes() method must + implement the Writable interface. +

+ + The code looks like this: +
+ public class GenericObject extends GenericWritable {
+ 
+   private static Class[] CLASSES = {
+               ClassType1.class, 
+               ClassType2.class,
+               ClassType3.class,
+               };
+
+   protected Class[] getTypes() {
+       return CLASSES;
+   }
+
+ }
+ 
+ + @since Nov 8, 2006]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new InputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using InputStream methods ...
+ }
+ 
+ @see DataInputBuffer + @see DataOutput]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a IntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + closes the input and output streams + at the end. + @param in InputStrem to read from + @param out OutputStream to write to + @param conf the Configuration object]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ignore any {@link IOException} or + null pointers. Must only be used for cleanup in exception handlers. + @param log the log to record problems to at debug level. Can be null. + @param closeables the objects to close]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a LongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A map is a directory containing two files, the data file, + containing all keys and values in the map, and a smaller index + file, containing a fraction of the keys. The fraction is determined by + {@link Writer#getIndexInterval()}. + +

The index file is read entirely into memory. Thus key implementations + should try to keep themselves small. + +

Map files are created by adding entries in-order. To maintain a large + database, perform updates by copying the previous version of a database and + merging in a sorted change list, to create a new version of the database in + a new file. Sorting large change lists can be done with {@link + SequenceFile.Sorter}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key and + val. Returns true if such a pair exists and false when at + the end of the map]]> + + + + + + + + + + + + + + + + key or if it does not exist, at the first entry + after the named key. + +- * @param key - key that we're trying to find +- * @param val - data value if key is found +- * @return - the key that was the closest match or null if eof.]]> + + + + + + + + + key does not exist, return + the first entry that falls just before the key. Otherwise, + return the record that sorts just after. + @return - the key that was the closest match or null if eof.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is an MD5Hash whose digest contains the + same values.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new OutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ OutputBuffer buffer = new OutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using OutputStream methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
+ @see DataOutputBuffer + @see InputBuffer]]> +
+
+ + + + + + + + + + + + + + + A {@link Comparator} that operates directly on byte representations of + objects. +

+ @param + @see DeserializerComparator]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SequenceFiles are flat files consisting of binary key/value + pairs. + +

SequenceFile provides {@link Writer}, {@link Reader} and + {@link Sorter} classes for writing, reading and sorting respectively.

+ + There are three SequenceFile Writers based on the + {@link CompressionType} used to compress key/value pairs: +
    +
  1. + Writer : Uncompressed records. +
  2. +
  3. + RecordCompressWriter : Record-compressed files, only compress + values. +
  4. +
  5. + BlockCompressWriter : Block-compressed files, both keys & + values are collected in 'blocks' + separately and compressed. The size of + the 'block' is configurable. +
+ +

The actual compression algorithm used to compress key and/or values can be + specified by using the appropriate {@link CompressionCodec}.

+ +

The recommended way is to use the static createWriter methods + provided by the SequenceFile to choose the preferred format.

+ +

The {@link Reader} acts as the bridge and can read any of the above + SequenceFile formats.

+ +

SequenceFile Formats

+ +

Essentially there are 3 different formats for SequenceFiles + depending on the CompressionType specified. All of them share a + common header described below. + +

+
    +
  • + version - 3 bytes of magic header SEQ, followed by 1 byte of actual + version number (e.g. SEQ4 or SEQ6) +
  • +
  • + keyClassName -key class +
  • +
  • + valueClassName - value class +
  • +
  • + compression - A boolean which specifies if compression is turned on for + keys/values in this file. +
  • +
  • + blockCompression - A boolean which specifies if block-compression is + turned on for keys/values in this file. +
  • +
  • + compression codec - CompressionCodec class which is used for + compression of keys and/or values (if compression is + enabled). +
  • +
  • + metadata - {@link Metadata} for this file. +
  • +
  • + sync - A sync marker to denote end of the header. +
  • +
+ +
Uncompressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record +
      +
    • Record length
    • +
    • Key length
    • +
    • Key
    • +
    • Value
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +
Record-Compressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record +
      +
    • Record length
    • +
    • Key length
    • +
    • Key
    • +
    • Compressed Value
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +
Block-Compressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record Block +
      +
    • Compressed key-lengths block-size
    • +
    • Compressed key-lengths block
    • +
    • Compressed keys block-size
    • +
    • Compressed keys block
    • +
    • Compressed value-lengths block-size
    • +
    • Compressed value-lengths block
    • +
    • Compressed values block-size
    • +
    • Compressed values block
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +

The compressed blocks of key lengths and value lengths consist of the + actual lengths of individual keys/values encoded in ZeroCompressedInteger + format.

+ + @see CompressionCodec]]> +
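+
+ A minimal write/read sketch for the formats described above (not part of
+ the original javadoc; the file name and key/value types are arbitrary
+ choices, and the usual conf/fs/io imports are assumed):
+
+     Configuration conf = new Configuration();
+     FileSystem fs = FileSystem.get(conf);
+     Path file = new Path("/tmp/example.seq");
+
+     // write a few records using the recommended createWriter factory
+     SequenceFile.Writer writer =
+       SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class);
+     try {
+       writer.append(new IntWritable(1), new Text("one"));
+       writer.append(new IntWritable(2), new Text("two"));
+     } finally {
+       writer.close();
+     }
+
+     // read the records back; next() returns false at end of file
+     SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+     try {
+       IntWritable key = new IntWritable();
+       Text value = new Text();
+       while (reader.next(key, value)) {
+         System.out.println(key + "\t" + value);
+       }
+     } finally {
+       reader.close();
+     }
+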
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key, skipping its + value. True if another entry exists, and false at end of file.]]> + + + + + + + + key and + val. Returns true if such a pair exists and false when at + end of file]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The position passed must be a position returned by {@link + SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary + position, use {@link SequenceFile.Reader#sync(long)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SegmentDescriptor + @param segments the list of SegmentDescriptors + @param tmpDir the directory to write temporary files into + @return RawKeyValueIterator + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For best performance, applications should make sure that the {@link + Writable#readFields(DataInput)} implementation of their keys is + very efficient. In particular, it should avoid allocating memory.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This always returns a synchronized position. In other words, + immediately after calling {@link SequenceFile.Reader#seek(long)} with a position + returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However + the key may be earlier in the file than key last written when this + method was called (e.g., with block-compression, it may be the first key + in the block that was being written when this method was called).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key. Returns + true if such a key exists and false when at the end of the set.]]> + + + + + + + key. + Returns key, or null if no match exists.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + position. Note that this + method avoids using the converter or doing String instatiation + @return the Unicode scalar value at position or -1 + if the position is invalid or points to a + trailing byte]]> + + + + + + + + + + what in the backing + buffer, starting as position start. The starting + position is measured in bytes and the return value is in + terms of byte position in the buffer. The backing buffer is + not converted to a string for this operation. 
+ @return byte position of the first occurence of the search + string in the UTF-8 buffer or -1 if not found]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a Text with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException.]]> + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException. + @return ByteBuffer: bytes stores at ByteBuffer.array() + and length is ByteBuffer.limit()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In + addition, it provides methods for string traversal without converting the + byte array to a string.

Also includes utilities for + serializing/deserialing a string, coding/decoding a string, checking if a + byte array contains valid UTF8 code, calculating the length of an encoded + string.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a UTF8 with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + Also includes utilities for efficiently reading and writing UTF-8. + + @deprecated replaced by Text]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This is useful when a class may evolve, so that instances written by the + old version of the class may still be processed by the new version. To + handle this situation, {@link #readFields(DataInput)} + implementations should catch {@link VersionMismatchException}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VIntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VLongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + out. + + @param out DataOuput to serialize this object into. + @throws IOException]]> + + + + + + + in. + +

For efficiency, implementations should attempt to re-use storage in the + existing object where possible.

+ + @param in DataInput to deseriablize this object from. + @throws IOException]]> +
+ + + Any key or value type in the Hadoop Map-Reduce + framework implements this interface.

+ +

Implementations typically implement a static read(DataInput) + method which constructs a new instance, calls {@link #readFields(DataInput)} + and returns the instance.

+ +

Example:

+

+     public class MyWritable implements Writable {
+       // Some data     
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public static MyWritable read(DataInput in) throws IOException {
+         MyWritable w = new MyWritable();
+         w.readFields(in);
+         return w;
+       }
+     }
+ 

]]> +
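+
+ A small round-trip sketch (not part of the original javadoc) showing how an
+ instance of the MyWritable example above can be serialized and restored with
+ the DataOutputBuffer/DataInputBuffer classes described earlier:
+
+     MyWritable before = new MyWritable();
+     DataOutputBuffer out = new DataOutputBuffer();
+     before.write(out);                          // serialize into the buffer
+
+     DataInputBuffer in = new DataInputBuffer();
+     in.reset(out.getData(), out.getLength());   // wrap the serialized bytes
+     MyWritable after = MyWritable.read(in);     // restore via the static read()
+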
+ + + + + + + + WritableComparables can be compared to each other, typically + via Comparators. Any type which is to be used as a + key in the Hadoop Map-Reduce framework should implement this + interface.

+ +

Example:

+

+     public class MyWritableComparable implements WritableComparable {
+       // Some data
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public int compareTo(MyWritableComparable w) {
+         int thisValue = this.counter;
+         int thatValue = w.counter;
+         return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+       }
+     }
+ 

]]> +
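+
+ Such a WritableComparable is often paired with a raw comparator registered
+ through WritableComparator (described below), so that keys can be compared
+ without deserialization. A hypothetical sketch for the example class above,
+ assuming counter is the first field written by write():
+
+     public static class Comparator extends WritableComparator {
+       public Comparator() {
+         super(MyWritableComparable.class);
+       }
+
+       // compare serialized records directly on their leading int
+       public int compare(byte[] b1, int s1, int l1,
+                          byte[] b2, int s2, int l2) {
+         int thisValue = readInt(b1, s1);
+         int thatValue = readInt(b2, s2);
+         return (thisValue < thatValue ? -1 : (thisValue == thatValue ? 0 : 1));
+       }
+     }
+
+     static {                                    // register it for the key class
+       WritableComparator.define(MyWritableComparable.class, new Comparator());
+     }
+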
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The default implementation reads the data into two {@link + WritableComparable}s (using {@link + Writable#readFields(DataInput)}, then calls {@link + #compare(WritableComparable,WritableComparable)}.]]> + + + + + + + The default implementation uses the natural ordering, calling {@link + Comparable#compareTo(Object)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This base implemenation uses the natural ordering. To define alternate + orderings, override {@link #compare(WritableComparable,WritableComparable)}. + +

One may optimize compare-intensive operations by overriding + {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are + provided to assist in optimized implementations of this method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum type + @param in DataInput to read from + @param enumType Class type of Enum + @return Enum represented by String read from DataInput + @throws IOException]]> + + + + + + + + + + + + + + + + len number of bytes in input streamin + @param in input stream + @param len number of bytes to skip + @throws IOException when skipped less number of bytes]]> + + + + + + + + + + + + + + CompressionCodec for which to get the + Compressor + @return Compressor for the given + CompressionCodec from the pool or a new one]]> + + + + + + CompressionCodec for which to get the + Decompressor + @return Decompressor for the given + CompressionCodec the pool or a new one]]> + + + + + + Compressor to be returned to the pool]]> + + + + + + Decompressor to be returned to the + pool]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Implementations are assumed to be buffered. This permits clients to + reposition the underlying input stream then call {@link #resetState()}, + without having to also synchronize client buffers.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + true if a preset dictionary is needed for decompression. + @return true if a preset dictionary is needed for decompression]]> + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-lzo library is loaded & initialized; + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair. 
+ http://www.oberhumer.com/opensource/lzo/]]> + + + + + + + + + + + + + + + + + + + + + true if lzo compressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if lzo decompressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-zlib is loaded & initialized + and can be loaded for this job, else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keep trying a limited number of times, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+ + + + + + + + Keep trying for a maximum time, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by the number of tries so far. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by a random + number in the range [0, 2^(number of retries)). +

]]> +
+
+ + + + + + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + + + A retry policy for RemoteException + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + Try once, and fail by re-throwing the exception. + This corresponds to having no retry mechanism in place. +

]]> +
+
+ + + + Try once, and fail silently for void methods, or by + re-throwing the exception for non-void methods. +

]]> +
+
+ + + + Keep trying forever. +

]]> +
+
+ + + A collection of useful implementations of {@link RetryPolicy}. +

]]> +
+
+ + + + + + + + + + Determines whether the framework should retry a + method for the given exception, and the number + of retries that have been made for that operation + so far. +

+ @param e The exception that caused the method to fail. + @param retries The number of times the method has been retried. + @return true if the method should be retried, + false if the method should not be retried + but shouldn't fail with an exception (only for void methods). + @throws Exception The re-thrown exception e indicating + that the method failed and should not be retried further.]]> +
+
+ + + Specifies a policy for retrying method failures. + Implementations of this interface should be immutable. +

]]> +
+
+ + + + + + + + + + + + Create a proxy for an interface of an implementation class + using the same retry policy for each method in the interface. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param retryPolicy the policy for retrying method call failures + @return the retry proxy]]> +
+
+ + + + + + + Create a proxy for an interface of an implementation class + using a set of retry policies specified by method name. + If no retry policy is defined for a method then a default of + {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param methodNameToPolicyMap a map of method names to retry policies + @return the retry proxy]]> +
+
+ + + A factory for creating retry proxies. +

]]> +
+
+ +
+ + + + + + + + Prepare the deserializer for reading.

]]> +
+
+ + + + + + Deserialize the next object from the underlying input stream. + If the object t is non-null then this deserializer + may set its internal state to the next object read from the input + stream. Otherwise, if the object t is null a new + deserialized object will be created. +

+ @return the deserialized object]]> +
+
+ + + + Close the underlying input stream and clear up any resources.

]]> +
+
+ + + Provides a facility for deserializing objects of type from an + {@link InputStream}. +

+ +

+ Deserializers are stateful, but must not buffer the input since + other producers may read from the input between calls to + {@link #deserialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + A {@link RawComparator} that uses a {@link Deserializer} to deserialize + the objects to be compared so that the standard {@link Comparator} can + be used to compare them. +

+

+ One may optimize compare-intensive operations by using a custom + implementation of {@link RawComparator} that operates directly + on byte representations. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + An experimental {@link Serialization} for Java {@link Serializable} classes. +

+ @see JavaSerializationComparator]]> +
+
+ + + + + + + + + + + + + A {@link RawComparator} that uses a {@link JavaSerialization} + {@link Deserializer} to deserialize objects that are then compared via + their {@link Comparable} interfaces. +

+ @param + @see JavaSerialization]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + Encapsulates a {@link Serializer}/{@link Deserializer} pair. +

+ @param ]]> +
+
+ + + + + + + Serializations are found by reading the io.serializations + property from conf, which is a comma-delimited list of + classnames. +

]]> +
+
+ + + + + + + + + + + + A factory for {@link Serialization}s. +

]]> +
+
+ + + + + + + + Prepare the serializer for writing.

]]> +
+
+ + + + + Serialize t to the underlying output stream.

]]> +
+
+ + + + Close the underlying output stream and clear up any resources.

]]> +
+
+ + + Provides a facility for serializing objects of type to an + {@link OutputStream}. +

+ +

+ Serializers are stateful, but must not buffer the output since + other producers may write to the output between calls to + {@link #serialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param, to the IPC server running at + address, returning the value. Throws exceptions if there are + network problems or if the remote code threw an exception.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unwraps any IOException. + + @param lookupTypes the desired exception class. + @return IOException, which is either the lookupClass exception or this.]]> + + + + + This unwraps any Throwable that has a constructor taking + a String as a parameter. + Otherwise it returns this. + + @return Throwable]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + protocol is a Java interface. All parameters and return types must + be one of: + +
  • a primitive type, boolean, byte, + char, short, int, long, + float, double, or void; or
  • a {@link String}; or
  • a {@link Writable}; or
  • an array of the above types
+ + All methods in the protocol should throw only IOException. No field data of + the protocol instance is transmitted.]]> +
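A sketch of an interface obeying these rules; the InventoryProtocol name and its methods are hypothetical:

    import java.io.IOException;
    import org.apache.hadoop.io.Text;

    /**
     * Every parameter and return type is a primitive, a String, a Writable,
     * or an array of those, and every method declares only IOException.
     */
    public interface InventoryProtocol {
      long getProtocolVersion() throws IOException;
      Text lookup(String key) throws IOException;
      String[] listKeys(int limit) throws IOException;
    }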
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + handlerCount determines + the number of handler threads that will be used to process calls.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #rpcQueueTime}.inc(time)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker, + as {@link JobTracker.State} + + @return the current state of the JobTracker.]]> + + + + + + + + + + + + ClusterStatus provides clients with information such as: +

    +
  1. + Size of the cluster. +
  2. + Task capacity of the cluster. +
  3. + The number of currently running map & reduce tasks. +
  4. + State of the JobTracker. +

+ +

Clients can query for the latest ClusterStatus, via + {@link JobClient#getClusterStatus()}.

+ + @see JobClient]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter can be of + any {@link Enum} type.

+ +

Counters are bunched into {@link Group}s, each comprising of + counters from a particular Enum class.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of counters, comprising of counters from a particular + counter {@link Enum} class. + +

Group handles localization of the class name and the + counter names.

]]> +
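A minimal sketch of defining such a counter group in a map task, assuming the old-API Mapper and Reporter signatures used in the examples later in this file; the RecordStats enum and the key/value types are arbitrary:

    import java.io.IOException;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;

    public class CountingMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, LongWritable> {

      // Each Enum class becomes a counter Group; each constant becomes a Counter.
      static enum RecordStats { TOTAL, EMPTY }

      public void map(LongWritable key, Text value,
                      OutputCollector<Text, LongWritable> output, Reporter reporter)
          throws IOException {
        reporter.incrCounter(RecordStats.TOTAL, 1);
        if (value.getLength() == 0) {
          reporter.incrCounter(RecordStats.EMPTY, 1);
        }
        output.collect(value, new LongWritable(1));
      }
    }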
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param fs the file system that the file is on + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides a generic implementation of + {@link #getSplits(JobConf, int)}. + Subclasses of FileInputFormat can also override the + {@link #isSplitable(FileSystem, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s.]]> + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the attemptid, say + attempt_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any + side-files required in ${mapred.work.output.dir} during execution + of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the + framework will move them out similarly - thus she doesn't have to pick + unique paths per task-attempt.

+ +

Note: the value of ${mapred.work.output.dir} during + execution of a particular task-attempt is actually + ${mapred.output.dir}/_temporary/_{$taskid}, and this value is + set by the map-reduce framework. So, just create any side-files in the + path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce + task to take advantage of this feature.

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
+
+ + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This method is used to validate the input directories when a job is + submitted so that the {@link JobClient} can fail early, with an useful + error message, in case of errors. For e.g. input directory does not exist. +

+ + @param job job configuration. + @throws InvalidInputException if the job does not have valid input + @deprecated getSplits is called in the client and can perform any + necessary validation of the input]]> +
+
+ + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For e.g. a split could + be <input-file-path, start, offset> tuple. + + @param job job configuration. + @param numSplits the desired number of splits, a hint. + @return an array of {@link InputSplit}s for the job.]]> + + + + + + + + + It is the responsibility of the RecordReader to respect + record boundaries while processing the logical split to present a + record-oriented view to the individual task.

+ + @param split the {@link InputSplit} + @param job the job that this split belongs to + @return a {@link RecordReader}]]> +
+
+ + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
  1. + Validate the input-specification of the job. +
  2. + Split-up the input file(s) into logical {@link InputSplit}s, each of + which is then assigned to an individual {@link Mapper}. +
  3. + Provide the {@link RecordReader} implementation to be used to glean + input records from the logical InputSplit for processing by + the {@link Mapper}. +
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Clearly, logical splits based on input-size is insufficient for many + applications since record boundaries are to respected. In such cases, the + application has to also implement a {@link RecordReader} on whom lies the + responsibilty to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see JobClient + @see FileInputFormat]]> + + + + + + + + + + InputSplit. + + @return the number of bytes in the input split. + @throws IOException]]> + + + + + + InputSplit is + located as an array of Strings. + @throws IOException]]> + + + + InputSplit represents the data to be processed by an + individual {@link Mapper}. + +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + jobid doesn't correspond to any known job. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient is the primary interface for the user-job to interact + with the {@link JobTracker}. + + JobClient provides facilities to submit jobs, track their + progress, access component-tasks' reports/logs, get the Map-Reduce cluster + status information etc. + +

The job submission process involves: +

    +
  1. + Checking the input and output specifications of the job. +
  2. + Computing the {@link InputSplit}s for the job. +
  3. + Setting up the requisite accounting information for the {@link DistributedCache} + of the job, if necessary. +
  4. + Copying the job's jar and configuration to the map-reduce system directory + on the distributed file-system. +
  5. + Submitting the job to the JobTracker and optionally monitoring + its status. +

+ + Normally the user creates the application, describes various facets of the + job via {@link JobConf} and then uses the JobClient to submit + the job and monitor its progress. + +

Here is an example on how to use JobClient:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     job.setInputPath(new Path("in"));
+     job.setOutputPath(new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+
+     // Submit the job, then poll for progress until the job is complete
+     JobClient.runJob(job);
+ 

+ +

Job Control

+ +

At times clients would chain map-reduce jobs to accomplish complex tasks + which cannot be done via a single map-reduce job. This is fairly easy since + the output of the job, typically, goes to distributed file-system and that + can be used as the input for the next job.

+ +

However, this also means that the onus on ensuring jobs are complete + (success/failure) lies squarely on the clients. In such situations the + various job-control options are: +

    +
  1. + {@link #runJob(JobConf)} : submits the job and returns only after + the job has completed. +
  2. + {@link #submitJob(JobConf)} : only submits the job, then poll the + returned handle to the {@link RunningJob} to query status and make + scheduling decisions. +
  3. + {@link JobConf#setJobEndNotificationURI(String)} : set up a notification + on job-completion, thus avoiding polling. +

+ + @see JobConf + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
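A minimal sketch of the submit-then-poll option, using only the JobClient and RunningJob methods described in this file; the sleep interval is arbitrary:

    import java.io.IOException;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.RunningJob;

    public class JobControlExample {
      public static boolean runAndWait(JobConf job)
          throws IOException, InterruptedException {
        JobClient client = new JobClient(job);
        RunningJob running = client.submitJob(job);   // returns immediately
        while (!running.isComplete()) {               // poll instead of blocking in runJob()
          System.out.println("map " + running.mapProgress()
                             + " reduce " + running.reduceProgress());
          Thread.sleep(5000);
        }
        return running.isSuccessful();
      }
    }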
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if framework should keep the intermediate files + for failed tasks, false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Note: +

+ @param dir the {@link Path} of the output directory for the map-reduce job.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the outputs of the maps are to be compressed, + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This comparator should be provided if the equivalence rules for keys + for sorting the intermediates are different from those for grouping keys + before each call to + {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

+ +

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + in a single call to the reduce function if K1 and K2 compare as equal.

+ +

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + how keys are sorted, this can be used in conjunction to simulate + secondary sort on values.

+ +

Note: This is not a guarantee of the reduce sort being + stable in any sense. (In any case, with the order of available + map-outputs to the reduce being non-deterministic, it wouldn't make + that much sense.)

+ + @param theClass the comparator class to be used for grouping keys. + It should implement RawComparator. + @see #setOutputKeyComparatorClass(Class)]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. Typically the combiner is same as the + the {@link Reducer} for the job i.e. {@link #getReducerClass()}. + + @return the user-defined combiner class used to combine map-outputs.]]> + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. + +

The combiner is a task-level aggregation operation which, in some cases, + helps to cut down the amount of data transferred from the {@link Mapper} to + the {@link Reducer}, leading to better performance.

+ +

Typically the combiner is the same as the Reducer for the + job i.e. {@link #setReducerClass(Class)}.

+ + @param theClass the user-defined combiner class used to combine + map-outputs.]]> +
+
+ + + + + + + + + + + true. + + @return true if speculative execution be used for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on, else false.]]> + + + + + true. + + @return true if speculative execution be + used for this job for map tasks, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for map tasks, + else false.]]> + + + + + true. + + @return true if speculative execution be used + for reduce tasks for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for reduce tasks, + else false.]]> + + + + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + Note: This is only a hint to the framework. The actual + number of spawned map tasks depends on the number of {@link InputSplit}s + generated by the job's {@link InputFormat#getSplits(JobConf, int)}. + + A custom {@link InputFormat} is typically used to accurately control + the number of map tasks for the job.

+ +

How many maps?

+ +

The number of maps is usually driven by the total size of the inputs + i.e. total number of blocks of the input files.

+ +

The right level of parallelism for maps seems to be around 10-100 maps + per-node, although it has been set up to 300 or so for very cpu-light map + tasks. Task setup takes a while, so it is best if the maps take at least a + minute to execute.

+ +

The default behavior of file-based {@link InputFormat}s is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of input files. However, the {@link FileSystem} blocksize of the + input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Thus, if you expect 10TB of input data and have a blocksize of 128MB, + you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is + used to set it even higher.

+ + @param n the number of map tasks for this job. + @see InputFormat#getSplits(JobConf, int) + @see FileInputFormat + @see FileSystem#getDefaultBlockSize() + @see FileStatus#getBlockSize()]]> +
+
+ + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + How many reduces? + +

The right number of reduces seems to be 0.95 or + 1.75 multiplied by (<no. of nodes> * + + mapred.tasktracker.reduce.tasks.maximum). +

+ +

With 0.95 all of the reduces can launch immediately and + start transferring map outputs as the maps finish. With 1.75 + the faster nodes will finish their first round of reduces and launch a + second wave of reduces doing a much better job of load balancing.

+ +

Increasing the number of reduces increases the framework overhead, but + increases load balancing and lowers the cost of failures.

+ +

The scaling factors above are slightly less than whole numbers to + reserve a few reduce slots in the framework for speculative-tasks, failures + etc.

+ +

Reducer NONE

+ +

It is legal to set the number of reduce-tasks to zero.

+ +

In this case the output of the map-tasks goes directly to the distributed + file-system, to the path set by + {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the + framework doesn't sort the map-outputs before writing them out to HDFS.

+ + @param n the number of reduce tasks for this job.]]> +
+
+ + + mapred.map.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per map task.]]> + + + + + + + + + + + mapred.reduce.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per reduce task.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + noFailures, the + tasktracker is blacklisted for this job. + + @param noFailures maximum no. of failures of a given job per tasktracker.]]> + + + + + blacklisted for this job. + + @return the maximum no. of failures of a given job per tasktracker.]]> + + + + + failed. + + Defaults to zero, i.e. any failed map-task results in + the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + failed. + + Defaults to zero, i.e. any failed reduce-task results + in the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The debug script can aid debugging of failed map tasks. The script is + given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setMapDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param mDbgScript the script name]]> +
+
+ + + + + + + + + The debug script can aid debugging of failed reduce tasks. The script + is given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script file needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setReduceDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param rDbgScript the script name]]> +
+
+ + + + + + + + null if it hasn't + been set. + @see #setJobEndNotificationURI(String)]]> + + + + + + The uri can contain 2 special parameters: $jobId and + $jobStatus. Those, if present, are replaced by the job's + identifier and completion-status respectively.

+ +

This is typically used by application-writers to implement chaining of + Map-Reduce jobs in an asynchronous manner.

+ + @param uri the job end notification uri + @see JobStatus + @see Job Completion and Chaining]]> +
+
+ + + + When a job starts, a shared directory is created at location + + ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ . + This directory is exposed to the users through + job.local.dir . + So, the tasks can use this space + as scratch space and share files among them.

+ This value is available as System property also. + + @return The localized job specific shared directory]]> +
+
+ + JobConf is the primary interface for a user to describe a + map-reduce job to the Hadoop framework for execution. The framework tries to + faithfully execute the job as-is described by JobConf, however: +
    +
  1. + Some configuration parameters might have been marked as + + final by administrators and hence cannot be altered. +
  2. + While some job parameters are straight-forward to set + (e.g. {@link #setNumReduceTasks(int)}), others interact subtly with the + rest of the framework and/or job-configuration and are relatively more + complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}). +

+ +

JobConf typically specifies the {@link Mapper}, combiner + (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and + {@link OutputFormat} implementations to be used etc. + +

Optionally JobConf is used to specify other advanced facets + of the job such as Comparators to be used, files to be put in + the {@link DistributedCache}, whether or not intermediate and/or job outputs + are to be compressed (and how), debuggability via user-provided scripts + ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}), + for doing post-processing on task logs, task's stdout, stderr, syslog, etc.

+ +

Here is an example on how to configure a job via JobConf:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     FileInputFormat.setInputPaths(job, new Path("in"));
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+     
+     job.setInputFormat(SequenceFileInputFormat.class);
+     job.setOutputFormat(SequenceFileOutputFormat.class);
+ 

+ + @see JobClient + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + any job + run on the jobtracker started at 200707121733, we would use : +
 
+ JobID.getTaskIDsPattern("200707121733", null);
+ 
+ which will return : +
 "job_200707121733_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @return a regex pattern matching JobIDs]]> +
+
+ + + An example JobID is : + job_200707121733_0003 , which represents the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse JobID strings, but rather + use appropriate constructors or {@link #forName(String)} method. + + @see TaskID + @see TaskAttemptID + @see JobTracker#getNewJobId() + @see JobTracker#getStartTime()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -archives + -files inputjar args]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + zero. + + @param conf configuration for the JobTracker. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + io.file.buffer.size specified in the given + Configuration. + @param in input stream + @param conf configuration + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Output pairs need not be of the same types as input pairs. A given + input pair may map to zero or many output pairs. Output pairs are + collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the input key. + @param value the input value. + @param output collects mapped keys and values. + @param reporter facility to report progress.]]> +
+ + + Maps are the individual tasks which transform input records into a + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link JobConf} for the + job via the {@link JobConfigurable#configure(JobConf)} and initialize + themselves. Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

The framework then calls + {@link #map(Object, Object, OutputCollector, Reporter)} + for each key/value pair in the InputSplit for that task.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the grouping by specifying + a Comparator via + {@link JobConf#setOutputKeyComparatorClass(Class)}.

+ +

The grouped Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

The intermediate, grouped outputs are always stored in + {@link SequenceFile}s. Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the JobConf.

+ +

If the job has + zero + reduces then the output of the Mapper is directly written + to the {@link FileSystem} without grouping by keys.

+ +

Example:

+

+     public class MyMapper<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Mapper<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+       
+       private String mapTaskId;
+       private String inputFile;
+       private int noRecords = 0;
+       
+       public void configure(JobConf job) {
+         mapTaskId = job.get("mapred.task.id");
+         inputFile = job.get("mapred.input.file");
+       }
+       
+       public void map(K key, V val,
+                       OutputCollector<K, V> output, Reporter reporter)
+       throws IOException {
+         // Process the <key, value> pair (assume this takes a while)
+         // ...
+         // ...
+         
+         // Let the framework know that we are alive, and kicking!
+         // reporter.progress();
+         
+         // Process some more
+         // ...
+         // ...
+         
+         // Increment the no. of <key, value> pairs processed
+         ++noRecords;
+
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+        
+         // Every 100 records update application-level status
+         if ((noRecords%100) == 0) {
+           reporter.setStatus(mapTaskId + " processed " + noRecords + 
+                              " from input-file: " + inputFile); 
+         }
+         
+         // Output the result
+         output.collect(key, val);
+       }
+     }
+ 

+ +

Applications may write a custom {@link MapRunnable} to exert greater + control on map processing e.g. multi-threaded Mappers etc.

+ + @see JobConf + @see InputFormat + @see Partitioner + @see Reducer + @see MapReduceBase + @see MapRunnable + @see SequenceFile]]> +
+
+ + + + + + + + + + + + + + + + + + + + + Provides default no-op implementations for a few methods, most non-trivial + applications need to override some of them.

]]> +
+
+ + + + + + + + + + + <key, value> pairs. + +

Mapping of input records to output records is complete when this method + returns.

+ + @param input the {@link RecordReader} to read the input records. + @param output the {@link OutputCollector} to collect the output records. + @param reporter {@link Reporter} to report progress, status-updates etc. + @throws IOException]]> +
+
+ + Custom implementations of MapRunnable can exert greater + control on map processing e.g. multi-threaded, asynchronous mappers etc.

+ + @see Mapper]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly + equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} + to construct RecordReader's for MultiFileSplit's. + @see MultiFileSplit]]> +
+
+ + + + + + + + + + + + + + + + + th Path]]> + + + + + + + + + + + th Path]]> + + + + + + + + + + + + + + + + + + + + + + + MultiFileSplit can be used to implement {@link RecordReader}'s, with + reading one record per file. + @see FileSplit + @see MultiFileInputFormat]]> + + + + + + + + + + + + + + + <key, value> pairs output by {@link Mapper}s + and {@link Reducer}s. + +

OutputCollector is the generalization of the facility + provided by the Map-Reduce framework to collect data output by either the + Mapper or the Reducer i.e. intermediate outputs + or the output of the job.

]]> +
+
+ + + + + + + + + + + + + + + + + + + This is to validate the output specification for the job when it is + a job is submitted. Typically checks that it does not already exist, + throwing an exception when it already exists, so that output is not + overwritten.

+ + @param ignored + @param job job configuration. + @throws IOException when output should not be attempted]]> +
+
+ + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    +
  1. + Validate the output-specification of the job. For e.g. check that the + output directory doesn't already exist. +
  2. + Provide the {@link RecordWriter} implementation to be used to write out + the output files of the job. Output files are stored in a + {@link FileSystem}. +
+ + @see RecordWriter + @see JobConf]]> +
+
+ + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be partitioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.

+ + @see Reducer]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0 to 1.0. + @throws IOException]]> + + + + RecordReader reads <key, value> pairs from an + {@link InputSplit}. + +

RecordReader, typically, converts the byte-oriented view of + the input, provided by the InputSplit, and presents a + record-oriented view for the {@link Mapper} & {@link Reducer} tasks for + processing. It thus assumes the responsibility of processing record + boundaries and presenting the tasks with keys and values.

+ + @see InputSplit + @see InputFormat]]> +
+
+ + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param reporter facility to report progress. + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + Reduces values for a given key. + +

The framework calls this method for each + <key, (list of values)> pair in the grouped inputs. + Output values must be of the same type as input values. Input keys must + not be altered. The framework will reuse the key and value objects + that are passed into the reduce, therefore the application should clone + the objects they want to keep a copy of. In many cases, all values are + combined into zero or one value. +

+ +

Output pairs are collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the key. + @param values the list of values to reduce. + @param output to collect keys and combined values. + @param reporter facility to report progress.]]> +
+ + + The number of Reducers for the job is set by the user via + {@link JobConf#setNumReduceTasks(int)}. Reducer implementations + can access the {@link JobConf} for the job via the + {@link JobConfigurable#configure(JobConf)} method and initialize themselves. + Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

Reducer has 3 primary phases:

+
    +
  1. + +

    Shuffle

    + +

    Reducer is input the grouped output of a {@link Mapper}. + In this phase the framework, for each Reducer, fetches the + relevant partition of the output of all the Mappers, via HTTP. +

    +
  2. + +

    Sort

    + +

    The framework groups Reducer inputs by keys + (since different Mappers may have output the same key) in this + stage.

    + +

    The shuffle and sort phases occur simultaneously i.e. while outputs are + being fetched they are merged.

    + +
    SecondarySort
    + +

    If equivalence rules for keys while grouping the intermediates are + different from those for grouping keys before reduction, then one may + specify a Comparator via + {@link JobConf#setOutputValueGroupingComparator(Class)}. Since + {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to + control how intermediate keys are grouped, these can be used in conjunction + to simulate secondary sort on values.

    + + + For example, say that you want to find duplicate web pages and tag them + all with the url of the "best" known example. You would set up the job + like: +
      +
    • Map Input Key: url
    • Map Input Value: document
    • Map Output Key: document checksum, url pagerank
    • Map Output Value: url
    • Partitioner: by checksum
    • OutputKeyComparator: by checksum and then decreasing pagerank
    • OutputValueGroupingComparator: by checksum
    +
  3. + +

    Reduce

    + +

    In this phase the + {@link #reduce(Object, Iterator, OutputCollector, Reporter)} + method is called for each <key, (list of values)> pair in + the grouped inputs.

    +

    The output of the reduce task is typically written to the + {@link FileSystem} via + {@link OutputCollector#collect(Object, Object)}.

    +
+ +

The output of the Reducer is not re-sorted.

+ +

Example:

+

+     public class MyReducer<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Reducer<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+        
+       private String reduceTaskId;
+       private int noKeys = 0;
+       
+       public void configure(JobConf job) {
+         reduceTaskId = job.get("mapred.task.id");
+       }
+       
+       public void reduce(K key, Iterator<V> values,
+                          OutputCollector<K, V> output, 
+                          Reporter reporter)
+       throws IOException {
+       
+         // Process
+         int noValues = 0;
+         while (values.hasNext()) {
+           V value = values.next();
+           
+           // Increment the no. of values for this key
+           ++noValues;
+           
+           // Process the <key, value> pair (assume this takes a while)
+           // ...
+           // ...
+           
+           // Let the framework know that we are alive, and kicking!
+           if ((noValues%10) == 0) {
+             reporter.progress();
+           }
+         
+           // Process some more
+           // ...
+           // ...
+           
+           // Output the <key, value> 
+           output.collect(key, value);
+         }
+         
+         // Increment the no. of <key, list of values> pairs processed
+         ++noKeys;
+         
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+         
+         // Every 100 keys update application-level status
+         if ((noKeys%100) == 0) {
+           reporter.setStatus(reduceTaskId + " processed " + noKeys);
+         }
+       }
+     }
+ 

+ + @see Mapper + @see Partitioner + @see Reporter + @see MapReduceBase]]> +
+
+ + + + + + + + + + + + + + + Enum. + @param amount A non-negative amount by which the counter is to + be incremented.]]> + + + + + + + + + + + + + + InputSplit that the map is reading from. + @throws UnsupportedOperationException if called outside a mapper]]> + + + + + + + + + {@link Mapper} and {@link Reducer} can use the Reporter + provided to report progress or just indicate that they are alive. In + scenarios where the application takes an insignificant amount of time to + process individual key/value pairs, this is crucial since the framework + might assume that the task has timed-out and kill that task. + +

Applications can also update {@link Counters} via the provided + Reporter .

+ + @see Progressable + @see Counters]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RunningJob is the user-interface to query for details on a + running Map-Reduce job. + +

Clients can get hold of RunningJob via the {@link JobClient} + and then query the running-job for details such as name, configuration, + progress etc.

+ + @see JobClient]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This allows the user to specify the key class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + This allows the user to specify the value class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output value class.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f. The filtering criteria is + MD5(key) % f == 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + f using + the criteria record# % f == 0. + For example, if the frequency is 10, one out of 10 records is returned.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + @param name The name of the server + @param port The port to use on the server + @param findPort whether the server should start at the given port and + increment by 1 until it finds a free port.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points to the log directory + "/static/" -> points to common static files (src/webapps/static) + "/" -> the jsp server code from (src/webapps/)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all task attempt IDs + of any jobtracker, in any job, of the first + map task, we would use : +
 
+ TaskAttemptID.getTaskAttemptIDsPattern(null, null, true, 1, null);
+ 
+ which will return : +
 "attempt_[^_]*_[0-9]*_m_000001_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @param attemptId the task attempt number, or null + @return a regex pattern matching TaskAttemptIDs]]> +
+
+ + + An example TaskAttemptID is : + attempt_200707121733_0003_m_000005_0 , which represents the + zeroth task attempt for the fifth map task in the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse TaskAttemptID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the first map task + of any jobtracker, of any job, we would use : +

 
+ TaskID.getTaskIDsPattern(null, null, true, 1);
+ 
+ which will return : +
 "task_[^_]*_[0-9]*_m_000001*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @return a regex pattern matching TaskIDs]]> +
+ + + + An example TaskID is : + task_200707121733_0003_m_000005 , which represents the + fifth map task in the third job running at the jobtracker + started at 200707121733. +

+ Applications should never construct or parse TaskID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.log.dir.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the Job was added.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ([,]*) + func ::= tbl(,"") + class ::= @see java.lang.Class#forName(java.lang.String) + path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) + } + Reads expression from the mapred.join.expr property and + user-supplied join types from mapred.join.define.<ident> + types. Paths supplied to tbl are given as input paths to the + InputFormat class listed. + @see #compose(java.lang.String, java.lang.Class, java.lang.String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ,

) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + mapred.join.define.<ident> to a classname. In the expression + mapred.join.expr, the identifier will be assumed to be a + ComposableRecordReader. + mapred.join.keycomparator can be a classname used to compare keys + in the join. + @see JoinRecordReader + @see MultiFilterRecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + capacity children to position + id in the parent reader. + The id of a root CompositeRecordReader is -1 by convention, but relying + on this is not recommended.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + override(S1,S2,S3) will prefer values + from S3 over S2, and values from S2 over S1 for all keys + emitted from all sources.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [,,...,]]]> + + + + + + + out. + TupleWritable format: + {@code + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Map implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured to use this MapRunnable class (using + the JobConf.setMapRunnerClass method) and + the number of thread the thread-pool can use with the + mapred.map.multithreadedrunner.threads property, its default + value is 10 threads. +
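+
+ A minimal sketch of that configuration step, as Java code. The job class name
+ (MyJob) is illustrative, and the runner is assumed to be
+ org.apache.hadoop.mapred.lib.MultithreadedMapRunner in this release:
+
+     import org.apache.hadoop.mapred.JobConf;
+     import org.apache.hadoop.mapred.lib.MultithreadedMapRunner;
+
+     public class MyJob {
+       public static JobConf configureMultithreaded() {
+         JobConf job = new JobConf(MyJob.class);
+         // Swap in the multithreaded runner instead of the default MapRunner.
+         job.setMapRunnerClass(MultithreadedMapRunner.class);
+         // Size the per-task thread pool; 10 is the documented default.
+         job.setInt("mapred.map.multithreadedrunner.threads", 20);
+         return job;
+       }
+     }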

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pairs. Uses + {@link StringTokenizer} to break text into tokens.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + generateKeyValPairs(Object key, Object value); public void + configure(JobConfjob); } + + The package also provides a base class, ValueAggregatorBaseDescriptor, + implementing the above interface. The user can extend the base class and + implement generateKeyValPairs accordingly. + + The primary work of generateKeyValPairs is to emit one or more key/value + pairs based on the input key/value pair. The key in an output key/value pair + encode two pieces of information: aggregation type and aggregation id. The + value will be aggregated onto the aggregation id according the aggregation + type. + + This class offers a function to generate a map/reduce job using Aggregate + framework. The function takes the following parameters: input directory spec + input format (text or sequence file) output directory a file specifying the + user plugin class]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When constructing the instance, if the factory property + contextName.class exists, + its value is taken to be the name of the class to instantiate. Otherwise, + the default is to create an instance of + org.apache.hadoop.metrics.spi.NullContext, which is a + dummy "no-op" context which will cause all metric data to be discarded. + + @param contextName the name of the context + @return the named MetricsContext]]> + + + + + + + + + + + + + + When the instance is constructed, this method checks if the file + hadoop-metrics.properties exists on the class path. If it + exists, it must be in the format defined by java.util.Properties, and all + the properties in the file are set as attributes on the newly created + ContextFactory instance. 
+ + @return the singleton ContextFactory instance]]> + + + + getFactory() method.]]> + + + + + + + + + + + + + + + + + + + startMonitoring() again after calling + this. + @see #close()]]> + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A record name identifies the kind of data to be reported. For example, a + program reporting statistics relating to the disks on a computer might use + a record name "diskStats".

+ + A record has zero or more tags. A tag has a name and a value. To + continue the example, the "diskStats" record might use a tag named + "diskName" to identify a particular disk. Sometimes it is useful to have + more than one tag, so there might also be a "diskType" with value "ide" or + "scsi" or whatever.

+ + A record also has zero or more metrics. These are the named + values that are to be reported to the metrics system. In the "diskStats" + example, possible metric names would be "diskPercentFull", "diskPercentBusy", + "kbReadPerSecond", etc.

+ + The general procedure for using a MetricsRecord is to fill in its tag and + metric values, and then call update() to pass the record to the + client library. + Metric data is not immediately sent to the metrics system + each time that update() is called. + An internal table is maintained, identified by the record name. This + table has columns + corresponding to the tag and the metric names, and rows + corresponding to each unique set of tag values. An update + either modifies an existing row in the table, or adds a new row with a set of + tag values that are different from all the other rows. Note that if there + are no tags, then there can be at most one row in the table.
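+
+ A hedged illustration of the procedure just described, using the "diskStats"
+ example from this javadoc. The MetricsUtil helper and the context name
+ "myContext" are assumptions, not taken from this file:
+
+     import org.apache.hadoop.metrics.MetricsContext;
+     import org.apache.hadoop.metrics.MetricsRecord;
+     import org.apache.hadoop.metrics.MetricsUtil;
+
+     public class DiskStatsReporter {
+       public static void report(String diskName, int percentFull) {
+         MetricsContext context = MetricsUtil.getContext("myContext");
+         MetricsRecord diskStats = MetricsUtil.createRecord(context, "diskStats");
+         diskStats.setTag("diskName", diskName);        // one row per unique tag set
+         diskStats.setMetric("diskPercentFull", percentFull);
+         diskStats.update();  // queued; emitted on the next timer period
+       }
+     }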

+ + Once a row is added to the table, its data will be sent to the metrics system + on every timer period, whether or not it has been updated since the previous + timer period. If this is inappropriate, for example if metrics were being + reported by some transient object in an application, the remove() + method can be used to remove the row and thus stop the data from being + sent.

+ + Note that the update() method is atomic. This means that it is + safe for different threads to be updating the same metric. More precisely, + it is OK for different threads to call update() on MetricsRecord instances + with the same set of tag names and tag values. Different threads should + not use the same MetricsRecord instance at the same time.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MetricsContext.registerUpdater().]]> + + + + + + + + + + + + + + + + + + + + + + + + + fileName attribute, + if specified. Otherwise the data will be written to standard + output.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is configured by setting ContextFactory attributes which in turn + are usually configured through a properties file. All the attributes are + prefixed by the contextName. For example, the properties file might contain: +

+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contextName.tableName. The returned map consists of + those attributes with the contextName and tableName stripped off.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class implements the internal table of metric data, and the timer + on which data is to be sent to the metrics system. Subclasses must + override the abstract emitRecord method in order to transmit + the data.

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + update + and remove().]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hostname or hostname:port. If + the specs string is null, defaults to localhost:defaultPort. + + @return a list of InetSocketAddress objects.]]> + + + + + + + + + + + + + + + + + + + ,name=" + Where the and are the supplied parameters + + @param serviceName + @param nameName + @param theMbean - the MBean to register + @return the named used to register the MBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.rpc.socket.factory.class.<ClassName>. When no + such parameter exists then fall back on the default socket factory as + configured by hadoop.rpc.socket.factory.class.default. If + this default socket factory is not configured, then fall back on the JVM + default socket factory. + + @param conf the configuration + @param clazz the class (usually a {@link VersionedProtocol}) + @return a socket factory]]> + + + + + + hadoop.rpc.socket.factory.default + + @param conf the configuration + @return the default socket factory as specified in the configuration or + the JVM default socket factory if the configuration does not + contain a default socket factory property.]]> + + + + + + + + + + + + + : + ://:/]]> + + + + + + + + : + ://:/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketInputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getInputStream()} is returned. In the later + case, the timeout argument is ignored and the timeout set with + {@link Socket#setSoTimeout(int)} applies for reads.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see #getInputStream(Socket, long) + + @param socket + @return InputStream for reading from the socket. + @throws IOException]]> +
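+
+ A short, hedged sketch of the pattern prescribed above; the host, port and
+ timeout values are illustrative only:
+
+     import java.io.InputStream;
+     import java.io.OutputStream;
+     import java.net.InetSocketAddress;
+     import java.net.Socket;
+     import org.apache.hadoop.net.NetUtils;
+
+     public class TimedSocketIO {
+       public static void talk(String host, int port) throws Exception {
+         Socket socket = new Socket();
+         socket.connect(new InetSocketAddress(host, port), 10000);
+         // Wrap through NetUtils instead of socket.getInputStream()/getOutputStream()
+         // so the 60s timeouts apply whenever the socket has a channel.
+         InputStream in = NetUtils.getInputStream(socket, 60000);
+         OutputStream out = NetUtils.getOutputStream(socket, 60000);
+         out.write(0);   // illustrative write
+         in.read();      // illustrative read
+         socket.close();
+       }
+     }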
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + +
+ + From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketOutputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getOutputStream()} is returned. In the later + case, the timeout argument is ignored and the write will wait until + data is available.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see #getOutputStream(Socket, long) + + @param socket + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+
+ + + + + + + + + + + + + + + + + + + + + node + + @param node + a node + @return true if node is already in the tree; false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scope + if scope starts with ~, choose one from the all nodes except for the + ones in scope; otherwise, choose one from scope + @param scope range of nodes from which a node will be choosen + @return the choosen node]]> + + + + + + + scope but not in excludedNodes + if scope starts with ~, return the number of nodes that are not + in scope and excludedNodes; + @param scope a path string that may start with ~ + @param excludedNodes a list of nodes + @return number of available nodes]]> + + + + + + + + + + + + reader + It linearly scans the array, if a local node is found, swap it with + the first element of the array. + If a local rack node is found, swap it with the first element following + the local node. + If neither local node or local rack node is found, put a random replica + location at postion 0. + It leaves the rest nodes untouched.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
+
+ + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + +
+ + Create a new ouput stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = getCount(). + @param newCapacity The new capacity in bytes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index idx = startVector(...); + while (!idx.done()) { + .... // read element of a vector + idx.incr(); + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This task takes the given record definition files and compiles them into + java or c++ + files. It is then up to the user to compile the generated files. + +

The task requires the file or the nested fileset element to be + specified. Optional attributes are language (set the output + language, default is "java"), + destdir (name of the destination directory for generated java/c++ + code, default is ".") and failonerror (specifies error handling + behavior. default is true). +

Usage

+
+ <recordcc
+       destdir="${basedir}/gensrc"
+       language="java">
+   <fileset include="**\/*.jr" />
+ </recordcc>
+ 
]]> +
+
+ +
+ + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi as a comma separated string in + conf as a property attr + + The String starts with the user name followed by the default group names, + and other group names. + + @param conf configuration + @param attr property name + @param ugi a UnixUserGroupInformation]]> + + + + + + + + conf + + The object is expected to store with the property name attr + as a comma separated string that starts + with the user name followed by group names. + If the property name is not defined, return null. + It's assumed that there is only one UGI per user. If this user already + has a UGI in the ugi map, return the ugi in the map. + Otherwise, construct a UGI from the configuration, store it in the + ugi map and return it. + + @param conf configuration + @param attr property name + @return a UnixUGI + @throws LoginException if the stored string is ill-formatted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool supports archiving and anaylzing (sort/grep) of log-files. + It takes as input + a) Input uri which will serve uris of the logs to be archived. + b) Output directory (not mandatory). + b) Directory on dfs to archive the logs. + c) The sort/grep patterns for analyzing the files and separator for boundaries. + Usage: + Logalyzer -archive -archiveDir -analysis -logs -grep -sort -separator +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GenericOptionsParser to parse only the generic Hadoop + arguments. + + The array of string arguments other than the generic arguments can be + obtained by {@link #getRemainingArgs()}. + + @param conf the Configuration to modify. + @param args command-line arguments.]]> + + + + + GenericOptionsParser to parse given options as well + as generic Hadoop options. + + The resulting CommandLine object can be obtained by + {@link #getCommandLine()}. + + @param conf the configuration to modify + @param options options built by the caller + @param args User-specified arguments]]> + + + + + Strings containing the un-parsed arguments + or empty array if commandLine was not defined.]]> + + + + + CommandLine object + to process the parsed arguments. + + Note: If the object is created with + {@link #GenericOptionsParser(Configuration, String[])}, then returned + object will only contain parsed generic options. + + @return CommandLine representing list of arguments + parsed against Options descriptor.]]> + + + + + + + + + + GenericOptionsParser is a utility to parse command line + arguments generic to the Hadoop framework. + + GenericOptionsParser recognizes several standarad command + line arguments, enabling applications to easily specify a namenode, a + jobtracker, additional configuration resources etc. + +

Generic Options

+ +

The supported generic options are:

+

+     -conf <configuration file>     specify a configuration file
+     -D <property=value>            use value for given property
+     -fs <local|namenode:port>      specify a namenode
+     -jt <local|jobtracker:port>    specify a job tracker
+     -files <comma separated list of files>    specify comma separated
+                            files to be copied to the map reduce cluster
+     -libjars <comma separated list of jars>   specify comma separated
+                            jar files to include in the classpath.
+     -archives <comma separated list of archives>    specify comma
+             separated archives to be unarchived on the compute machines.
+
+ 

+ +

The general command line syntax is:

+

+ bin/hadoop command [genericOptions] [commandOptions]
+ 

+ +

Generic command line arguments might modify + Configuration objects, given to constructors.

+ +

The functionality is implemented using Commons CLI.

+ +

Examples:

+

+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+ 
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+     
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+     
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+ 
+ $ bin/hadoop jar -libjars testlib.jar 
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+ 

+ + @see Tool + @see ToolRunner]]> +
+
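+
+ A hedged sketch of using the parser programmatically, along the lines of the
+ constructor and getRemainingArgs() documented above; the MyTool class name is
+ illustrative:
+
+     import org.apache.hadoop.conf.Configuration;
+     import org.apache.hadoop.util.GenericOptionsParser;
+
+     public class MyTool {
+       public static void main(String[] args) throws Exception {
+         Configuration conf = new Configuration();
+         // Consumes -conf/-D/-fs/-jt/-files/-libjars/-archives and applies them to conf.
+         GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+         String[] toolArgs = parser.getRemainingArgs(); // everything that was not generic
+         System.out.println("remaining args: " + toolArgs.length);
+       }
+     }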
+ + + + + + + + + Class<T>) of the + argument of type T. + @param The type of the argument + @param t the object to get it class + @return Class<T>]]> + + + + + + + List<T> to a an array of + T[]. + @param c the Class object of the items in the list + @param list the list to convert]]> + + + + + + List<T> to a an array of + T[]. + @param list the list to convert + @throws ArrayIndexOutOfBoundsException if the list is empty. + Use {@link #toArray(Class, List)} if the list may be empty.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-hadoop is loaded, + else false]]> + + + + + + true if native hadoop libraries, if present, can be + used for this job; false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { pq.top().change(); pq.adjustTop(); } + instead of
+  { o = pq.pop(); o.change(); pq.push(o); }
+ 
]]> +
+
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clients and/or applications can use the provided Progressable + to explicitly report progress to the Hadoop framework. This is especially + important for operations which take an insignificant amount of time since, + in-lieu of the reported progress, the framework has to assume that an error + has occured and time-out the operation.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Class is to be obtained + @return the correctly typed Class of the given object.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop Pipes + or Hadoop Streaming. + + It also checks to ensure that we are running on a *nix platform else + (e.g. in Cygwin/Windows) it returns null. + @param job job configuration + @return a String[] with the ulimit command arguments or + null if we are running on a non *nix platform or + if the limit is unspecified.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell interface. + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell can be used to run unix commands like du or + df. It also offers facilities to gate commands by + time-intervals.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ShellCommandExecutorshould be used in cases where the output + of the command needs no explicit parsing and where the command, working + directory and the environment remains unchanged. The output of the command + is stored as-is and is expected to be small.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ArrayList of string values]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the char to be escaped + @return an escaped string]]> + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the escaped char + @return an unescaped string]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tool, is the standard for any Map-Reduce tool/application. + The tool/application should delegate the handling of + + standard command-line options to {@link ToolRunner#run(Tool, String[])} + and only handle its custom arguments.

+ +

Here is how a typical Tool is implemented:

+

+     public class MyApp extends Configured implements Tool {
+     
+       public int run(String[] args) throws Exception {
+         // Configuration processed by ToolRunner
+         Configuration conf = getConf();
+         
+         // Create a JobConf using the processed conf
+         JobConf job = new JobConf(conf, MyApp.class);
+         
+         // Process custom command-line options
+         Path in = new Path(args[1]);
+         Path out = new Path(args[2]);
+         
+         // Specify various job-specific parameters     
+         job.setJobName("my-app");
+         job.setInputPath(in);
+         job.setOutputPath(out);
+         job.setMapperClass(MyApp.MyMapper.class);
+         job.setReducerClass(MyApp.MyReducer.class);
+
+         // Submit the job, then poll for progress until the job is complete
+         JobClient.runJob(job);
+         return 0;
+       }
+       
+       public static void main(String[] args) throws Exception {
+         // Let ToolRunner handle generic command-line options 
+         int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+         
+         System.exit(res);
+       }
+     }
+ 

+ + @see GenericOptionsParser + @see ToolRunner]]> +
+
+ + + + + + + + + + + + Tool by {@link Tool#run(String[])}, after + parsing with the given generic arguments. Uses the given + Configuration, or builds one if null. + + Sets the Tool's configuration with the possibly modified + version of the conf. + + @param conf Configuration for the Tool. + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + Tool with its Configuration. + + Equivalent to run(tool.getConf(), tool, args). + + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + + + ToolRunner can be used to run classes implementing + Tool interface. It works in conjunction with + {@link GenericOptionsParser} to parse the + + generic hadoop command line arguments and modifies the + Configuration of the Tool. The + application-specific options are passed along without being modified. +

+ + @see Tool + @see GenericOptionsParser]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
diff --git a/lib/jdiff/hadoop_0.18.2.xml b/lib/jdiff/hadoop_0.18.2.xml new file mode 100644 index 00000000000..08173ab82dc --- /dev/null +++ b/lib/jdiff/hadoop_0.18.2.xml @@ -0,0 +1,38788 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + final. + + @param name resource to be added, the classpath is examined for a file + with that name.]]> + + + + + + final. + + @param url url of the resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param file file-path of resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + name property, null if + no such property exists. + + Values are processed for variable expansion + before being returned. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + name property, without doing + variable expansion. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + + value of the name property. + + @param name property name. + @param value property value.]]> + + + + + + + name property. If no such property + exists, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value, or defaultValue if the property + doesn't exist.]]> + + + + + + + name property as an int. + + If no such property exists, or if the specified value is not a valid + int, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as an int, + or defaultValue.]]> + + + + + + + name property to an int. + + @param name property name. + @param value int value of the property.]]> + + + + + + + name property as a long. + If no such property is specified, or if the specified value is not a valid + long, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a long, + or defaultValue.]]> + + + + + + + name property to a long. + + @param name property name. + @param value long value of the property.]]> + + + + + + + name property as a float. + If no such property is specified, or if the specified value is not a valid + float, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a float, + or defaultValue.]]> + + + + + + + name property as a boolean. + If no such property is specified, or if the specified value is not a valid + boolean, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a boolean, + or defaultValue.]]> + + + + + + + name property to a boolean. + + @param name property name. + @param value boolean value of the property.]]> + + + + + + + + + + + + + name property as + a collection of Strings. + If no such property is specified then empty collection is returned. +

+ This is an optimized version of {@link #getStrings(String)} + + @param name property name. + @return property value as a collection of Strings.]]> + + + + + + name property as + an array of Strings. + If no such property is specified then null is returned. + + @param name property name. + @return property value as an array of Strings, + or null.]]> + + + + + + + name property as + an array of Strings. + If no such property is specified then default value is returned. + + @param name property name. + @param defaultValue The default value + @return property value as an array of Strings, + or default value.]]> + + + + + + + name property as + as comma delimited values. + + @param name property name. + @param values The values]]> + + + + + + + + + + + + + + name property as a Class. + If no such property is specified, then defaultValue is + returned. + + @param name the class name. + @param defaultValue default value. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property as a Class + implementing the interface specified by xface. + + If no such property is specified, then defaultValue is + returned. + + An exception is thrown if the returned class does not implement the named + interface. + + @param name the class name. + @param defaultValue default value. + @param xface the interface implemented by the named class. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property to the name of a + theClass implementing the given interface xface. + + An exception is thrown if theClass does not implement the + interface xface. + + @param name property name. + @param theClass property value. + @param xface the interface implemented by the named class.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + + + + + name. + + @param name configuration resource name. + @return an input stream attached to the resource.]]> + + + + + + name. + + @param name configuration resource name. + @return a reader attached to the resource.]]> + + + + + String + key-value pairs in the configuration. + + @return an iterator over the entries.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + true to set quiet-mode on, false + to turn it off.]]> + + + + + + + + + + + Resources + +

Configurations are specified by resources. A resource contains a set of + name/value pairs as XML data. Each resource is named by either a + String or by a {@link Path}. If named by a String, + then the classpath is examined for a file with that name. If named by a + Path, then the local filesystem is examined directly, without + referring to the classpath. + +

Hadoop by default specifies two resources, loaded in-order from the + classpath:

    +
  1. hadoop-default.xml + : Read-only defaults for hadoop.
  2. +
  3. hadoop-site.xml: Site-specific configuration for a given hadoop + installation.
  4. +
+ Applications may add additional resources, which are loaded + subsequent to these resources in the order they are added. + +

Final Parameters

+ +

Configuration parameters may be declared final. + Once a resource declares a value final, no subsequently-loaded + resource can alter that value. + For example, one might define a final parameter with: +

+  <property>
+    <name>dfs.client.buffer.dir</name>
+    <value>/tmp/hadoop/dfs/client</value>
+    <final>true</final>
+  </property>
+ + Administrators typically define parameters as final in + hadoop-site.xml for values that user applications may not alter. + +

Variable Expansion

+ +

Value strings are first processed for variable expansion. The + available properties are:

    +
  1. Other properties defined in this Configuration; and, if a name is + undefined here,
  2. +
  3. Properties in {@link System#getProperties()}.
  4. +
+ +

For example, if a configuration resource contains the following property + definitions: +

+  <property>
+    <name>basedir</name>
+    <value>/user/${user.name}</value>
+  </property>
+  
+  <property>
+    <name>tempdir</name>
+    <value>${basedir}/tmp</value>
+  </property>
+ + When conf.get("tempdir") is called, then ${basedir} + will be resolved to another property in this Configuration, while + ${user.name} would then ordinarily be resolved to the value + of the System property with that name.]]> +
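+
+ A brief sketch tying the pieces above together; the extra resource name and
+ the my.app.retries property are hypothetical:
+
+     import org.apache.hadoop.conf.Configuration;
+
+     public class ConfExample {
+       public static void main(String[] args) {
+         Configuration conf = new Configuration();        // hadoop-default.xml, then hadoop-site.xml
+         conf.addResource("my-app-site.xml");             // hypothetical extra classpath resource
+         String tempDir = conf.get("tempdir");            // ${basedir} is expanded as described above
+         int retries = conf.getInt("my.app.retries", 3);  // hypothetical property with a default
+         System.out.println(tempDir + " " + retries);
+       }
+     }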
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DistributedCache is a facility provided by the Map-Reduce + framework to cache files (text, archives, jars etc.) needed by applications. +

+ +

Applications specify the files, via urls (hdfs:// or http://) to be cached + via the {@link JobConf}. The DistributedCache assumes that the + files specified via hdfs:// urls are already present on the + {@link FileSystem} at the path specified by the url.

+ +

The framework will copy the necessary files on to the slave node before + any tasks for the job are executed on that node. Its efficiency stems from + the fact that the files are only copied once per job and the ability to + cache archives which are un-archived on the slaves.

+ +

DistributedCache can be used to distribute simple, read-only + data/text files and/or more complex types such as archives, jars etc. + Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. + Jars may be optionally added to the classpath of the tasks, a rudimentary + software distribution mechanism. Files have execution permissions. + Optionally users can also direct it to symlink the distributed cache file(s) + into the working directory of the task.

+ +

DistributedCache tracks modification timestamps of the cache + files. Clearly the cache files should not be modified by the application + or externally while the job is executing.

+ +

Here is an illustrative example on how to use the + DistributedCache:

+

+     // Setting up the cache for the application
+     
+     1. Copy the requisite files to the FileSystem:
+     
+     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
+     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
+     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+     
+     2. Setup the application's JobConf:
+     
+     JobConf job = new JobConf();
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
+                                   job);
+     DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
+     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
+     
+     3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+     
+     public static class MapClass extends MapReduceBase  
+     implements Mapper<K, V, K, V> {
+     
+       private Path[] localArchives;
+       private Path[] localFiles;
+       
+       public void configure(JobConf job) {
+         // Get the cached archives/files
+         localArchives = DistributedCache.getLocalCacheArchives(job);
+         localFiles = DistributedCache.getLocalCacheFiles(job);
+       }
+       
+       public void map(K key, V value, 
+                       OutputCollector<K, V> output, Reporter reporter) 
+       throws IOException {
+         // Use data from the cached archives/files here
+         // ...
+         // ...
+         output.collect(k, v);
+       }
+     }
+     
+ 

+ + @see JobConf + @see JobClient]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BufferedFSInputStream + with the specified buffer size, + and saves its argument, the input stream + in, for later use. An internal + buffer array of length size + is created and stored in buf. + + @param in the underlying input stream. + @param size the buffer size. + @exception IllegalArgumentException if size <= 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + setReplication of FileSystem + @param src file name + @param replication new replication + @throws IOException + @return true if successful; + false if file does not exist or is a directory]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ']]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fs.scheme.class whose value names the FileSystem class. + The entire URI is passed to the FileSystem instance's initialize method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Return all the files that match filePattern and are not checksum + files. Results are sorted by their names. + +

+ A filename pattern is composed of regular characters and + special pattern matching characters, which are: + +

+
+
+

+

? +
Matches any single character. + +

+

* +
Matches zero or more characters. + +

+

[abc] +
Matches a single character from character set + {a,b,c}. + +

+

[a-b] +
Matches a single character from the character range + {a...b}. Note that character a must be + lexicographically less than or equal to character b. + +

+

[^a] +
Matches a single character that is not from character set or range + {a}. Note that the ^ character must occur + immediately to the right of the opening bracket. + +

+

\c +
Removes (escapes) any special meaning of character c. + +

+

{ab,cd} +
Matches a string from the string set {ab, cd} + +

+

{ab,c{de,fh}} +
Matches a string from the string set {ab, cde, cfh} + +
+
+
+ + @param pathPattern a regular expression specifying a pth pattern + + @return an array of paths that match the path pattern + @throws IOException]]> +
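+
+ A hedged example of expanding such a pattern; the path is illustrative and the
+ FileStatus-returning globStatus overload is assumed for this API level:
+
+     import org.apache.hadoop.conf.Configuration;
+     import org.apache.hadoop.fs.FileStatus;
+     import org.apache.hadoop.fs.FileSystem;
+     import org.apache.hadoop.fs.Path;
+
+     public class GlobExample {
+       public static void main(String[] args) throws Exception {
+         FileSystem fs = FileSystem.get(new Configuration());
+         // Matches e.g. /logs/2008-10-01/part-00000, skipping checksum files.
+         FileStatus[] matches = fs.globStatus(new Path("/logs/2008-*/part-[0-9]*"));
+         for (FileStatus status : matches) {
+           System.out.println(status.getPath());
+         }
+       }
+     }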
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All user code that may potentially use the Hadoop Distributed + File System should be written to use a FileSystem object. The + Hadoop DFS is a multi-machine system that appears as a single + disk. It's useful because of its fault tolerance and potentially + very large capacity. + +

+ The local implementation is {@link LocalFileSystem} and distributed + implementation is {@link DistributedFileSystem}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FilterFileSystem contains + some other file system, which it uses as + its basic file system, possibly transforming + the data along the way or providing additional + functionality. The class FilterFileSystem + itself simply overrides all methods of + FileSystem with versions that + pass all requests to the contained file + system. Subclasses of FilterFileSystem + may further override some of these methods + and may also provide additional methods + and fields.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + buf at offset + and checksum into checksum. + The method is used for implementing read, therefore, it should be optimized + for sequential reading + @param pos chunkPos + @param buf desitination buffer + @param offset offset in buf at which to store data + @param len maximun number of bytes to read + @return number of bytes read]]> + + + + + + + + + + + + + + + + + -1 if the end of the + stream is reached. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + This method implements the general contract of the corresponding + {@link InputStream#read(byte[], int, int) read} method of + the {@link InputStream} class. As an additional + convenience, it attempts to read as many bytes as possible by repeatedly + invoking the read method of the underlying stream. This + iterated read continues until one of the following + conditions becomes true:

    + +
  • The specified number of bytes have been read, + +
  • The read method of the underlying stream returns + -1, indicating end-of-file. + +
If the first read on the underlying stream returns + -1 to indicate end-of-file then this method returns + -1. Otherwise this method returns the number of bytes + actually read. + + @param b destination buffer. + @param off offset at which to start storing bytes. + @param len maximum number of bytes to read. + @return the number of bytes read, or -1 if the end of + the stream has been reached. + @exception IOException if an I/O error occurs. + ChecksumException if any checksum error occurs]]> +
+ + + + + + + + + + + + n bytes of data from the + input stream. + +

This method may skip more bytes than are remaining in the backing + file. This produces no exception and the number of bytes skipped + may include some number of bytes that were beyond the EOF of the + backing file. Attempting to read from the stream after skipping past + the end will result in -1 indicating the end of the file. + +

If n is negative, no bytes are skipped. + + @param n the number of bytes to be skipped. + @return the actual number of bytes skipped. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to skip to is corrupted]]> + + + + + + + This method may seek past the end of the file. + This produces no exception and an attempt to read from + the stream will result in -1 indicating the end of the file. + + @param pos the postion to seek to. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to seek to is corrupted]]> + + + + + + + + + + len bytes from + stm + + @param stm an input stream + @param buf destiniation buffer + @param offset offset at which to store data + @param len number of bytes to read + @return actual number of bytes read + @throws IOException if there is any IO error]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + len bytes from the specified byte array + starting at offset off and generate a checksum for + each data chunk. + +

This method stores bytes from the given array into this + stream's buffer before it gets checksumed. The buffer gets checksumed + and flushed to the underlying output stream when all data + in a checksum chunk are in the buffer. If the buffer is empty and + requested length is at least as large as the size of next checksum chunk + size, this method will checksum and write the chunk directly + to the underlying output stream. Thus it avoids uneccessary data copy. + + @param b the data. + @param off the start offset in the data. + @param len the number of bytes to write. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if and only if pathname + should be included]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + trash feature. Files are moved to a user's trash + directory, a subdirectory of their home directory named ".Trash". Files are + initially moved to a current sub-directory of the trash directory. + Within that sub-directory their original path is preserved. Periodically + one may checkpoint the current trash and remove older checkpoints. (This + design permits trash management without enumeration of the full trash + content, without date support in the filesystem, and without clock + synchronization.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} backed by an FTP client provided by Apache Commons Net. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is a tool for migrating data from an older to a newer version + of an S3 filesystem. +

+

+ All files in the filesystem are migrated by re-writing the block metadata + - no datafiles are touched. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + Extracts AWS credentials from the filesystem URI or configuration. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A block-based {@link FileSystem} backed by + Amazon S3. +

+ @see NativeS3FileSystem]]> +
+
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If f is a file, this method will make a single call to S3. + If f is a directory, this method will make a maximum of + (n / 1000) + 2 calls to S3, where n is the total number of + files and directories contained directly in f. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} for reading and writing files stored on + Amazon S3. + Unlike {@link org.apache.hadoop.fs.s3.S3FileSystem} this implementation + stores files on S3 in their + native form so they can be read by other S3 tools. +

+ @see org.apache.hadoop.fs.s3.S3FileSystem]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nth value.]]> + + + + + + + + + + + + + + + + + + + + + nth value in the file.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + public class IntArrayWritable extends ArrayWritable { + public IntArrayWritable() { + super(IntWritable.class); + } + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a ByteWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataInputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using DataInput methods ...
+ }
+ 
]]> +
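+ A small self-contained sketch of the pattern above (added for illustration only; it assumes nothing beyond the reset/read behaviour described here):
+
+   import java.io.ByteArrayOutputStream;
+   import java.io.DataOutputStream;
+   import java.io.IOException;
+   import org.apache.hadoop.io.DataInputBuffer;
+
+   public class DataInputBufferExample {
+     public static void main(String[] args) throws IOException {
+       // Produce some serialized bytes to read back.
+       ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+       DataOutputStream out = new DataOutputStream(bytes);
+       out.writeInt(42);
+       out.writeUTF("hello");
+       out.flush();
+       byte[] data = bytes.toByteArray();
+
+       // Re-use a single DataInputBuffer across reads instead of allocating
+       // a new DataInputStream/ByteArrayInputStream each time.
+       DataInputBuffer in = new DataInputBuffer();
+       in.reset(data, data.length);
+       System.out.println(in.readInt());   // 42
+       System.out.println(in.readUTF());   // hello
+     }
+   }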
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataOutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using DataOutput methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
]]> +
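+ A small self-contained sketch of the write-side pattern above (added for illustration only; the output file name is arbitrary):
+
+   import java.io.FileOutputStream;
+   import java.io.IOException;
+   import org.apache.hadoop.io.DataOutputBuffer;
+
+   public class DataOutputBufferExample {
+     public static void main(String[] args) throws IOException {
+       DataOutputBuffer buffer = new DataOutputBuffer();
+       FileOutputStream file = new FileOutputStream("records.bin");
+       try {
+         for (int i = 0; i < 3; i++) {
+           buffer.reset();                      // re-use the same backing array
+           buffer.writeInt(i);
+           buffer.writeUTF("record-" + i);
+           // getData() returns the internal array; only the first
+           // getLength() bytes are valid.
+           file.write(buffer.getData(), 0, buffer.getLength());
+         }
+       } finally {
+         file.close();
+       }
+     }
+   }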
+
+ + + + + + + + + + + + + + + + + + + + + + + + the class of the item + @param conf the configuration to store + @param item the object to be stored + @param keyName the name of the key to use + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param items the objects to be stored + @param keyName the name of the key to use + @throws IndexOutOfBoundsException if the items array is empty + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + DefaultStringifier offers convenience methods to store/load objects to/from + the configuration. + + @param the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a DoubleWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a FloatWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When two sequence files, which have same Key type but different Value + types, are mapped out to reduce, multiple Value types is not allowed. + In this case, this class can help you wrap instances with different types. +

+ +

+ Compared with ObjectWritable, this class is much more efficient, + because ObjectWritable appends the class declaration as a String + to the output file in every key-value pair. +

+ +

+ Generic Writable implements {@link Configurable} interface, so that it will be + configured by the framework. The configuration is passed to the wrapped objects + implementing {@link Configurable} interface before deserialization. +

+ + how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines + the classes that will be wrapped in GenericObject in the application. + Attention: the classes defined in getTypes() must + implement the Writable interface. +

+ + The code looks like this: +
+ public class GenericObject extends GenericWritable {
+ 
+   private static Class[] CLASSES = {
+               ClassType1.class, 
+               ClassType2.class,
+               ClassType3.class,
+               };
+
+   protected Class[] getTypes() {
+       return CLASSES;
+   }
+
+ }
+ 
+ + @since Nov 8, 2006]]> +
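+ A brief usage sketch (illustrative only; ClassType1 is the placeholder type from the example above, not a real class):
+
+   GenericObject wrapper = new GenericObject();
+   wrapper.set(new ClassType1());        // wrap a value of one of the declared types
+   // ... emit 'wrapper' as the job's value type ...
+
+   Writable wrapped = wrapper.get();     // unwrap on the receiving side
+   if (wrapped instanceof ClassType1) {
+     // handle the ClassType1 value
+   }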
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new InputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using InputStream methods ...
+ }
+ 
+ @see DataInputBuffer + @see DataOutput]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a IntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + closes the input and output streams + at the end. + @param in InputStrem to read from + @param out OutputStream to write to + @param conf the Configuration object]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ignore any {@link IOException} or + null pointers. Must only be used for cleanup in exception handlers. + @param log the log to record problems to at debug level. Can be null. + @param closeables the objects to close]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a LongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A map is a directory containing two files, the data file, + containing all keys and values in the map, and a smaller index + file, containing a fraction of the keys. The fraction is determined by + {@link Writer#getIndexInterval()}. + +

The index file is read entirely into memory. Thus key implementations + should try to keep themselves small. + +

Map files are created by adding entries in-order. To maintain a large + database, perform updates by copying the previous version of a database and + merging in a sorted change list, to create a new version of the database in + a new file. Sorting large change lists can be done with {@link + SequenceFile.Sorter}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key and + val. Returns true if such a pair exists and false when at + the end of the map]]> + + + + + + + + + + + + + + + + key or if it does not exist, at the first entry + after the named key. + +- * @param key - key that we're trying to find +- * @param val - data value if key is found +- * @return - the key that was the closest match or null if eof.]]> + + + + + + + + + key does not exist, return + the first entry that falls just before the key. Otherwise, + return the record that sorts just after. + @return - the key that was the closest match or null if eof.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is an MD5Hash whose digest contains the + same values.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new OutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ OutputBuffer buffer = new OutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using OutputStream methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
+ @see DataOutputBuffer + @see InputBuffer]]> +
+
+ + + + + + + + + + + + + + + A {@link Comparator} that operates directly on byte representations of + objects. +

+ @param + @see DeserializerComparator]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SequenceFiles are flat files consisting of binary key/value + pairs. + +

SequenceFile provides {@link Writer}, {@link Reader} and + {@link Sorter} classes for writing, reading and sorting respectively.

+ + There are three SequenceFile Writers based on the + {@link CompressionType} used to compress key/value pairs: +
+   1. Writer : Uncompressed records.
+   2. RecordCompressWriter : Record-compressed files, only compress values.
+   3. BlockCompressWriter : Block-compressed files, both keys & values are collected in 'blocks' separately and compressed. The size of the 'block' is configurable.
+ +

The actual compression algorithm used to compress key and/or values can be + specified by using the appropriate {@link CompressionCodec}.

+ +

The recommended way is to use the static createWriter methods + provided by the SequenceFile to choose the preferred format.

+ +

The {@link Reader} acts as the bridge and can read any of the above + SequenceFile formats.

+ +

SequenceFile Formats

+ +

Essentially there are 3 different formats for SequenceFiles + depending on the CompressionType specified. All of them share a + common header described below. + +

+
+   • version - 3 bytes of magic header SEQ, followed by 1 byte of actual version number (e.g. SEQ4 or SEQ6)
+   • keyClassName - key class
+   • valueClassName - value class
+   • compression - A boolean which specifies if compression is turned on for keys/values in this file.
+   • blockCompression - A boolean which specifies if block-compression is turned on for keys/values in this file.
+   • compression codec - CompressionCodec class which is used for compression of keys and/or values (if compression is enabled).
+   • metadata - {@link Metadata} for this file.
+   • sync - A sync marker to denote end of the header.
+ +
Uncompressed SequenceFile Format
+
+   • Header
+   • Record
+     • Record length
+     • Key length
+     • Key
+     • Value
+   • A sync-marker every few 100 bytes or so.
+ +
Record-Compressed SequenceFile Format
+
+   • Header
+   • Record
+     • Record length
+     • Key length
+     • Key
+     • Compressed Value
+   • A sync-marker every few 100 bytes or so.
+ +
Block-Compressed SequenceFile Format
+
+   • Header
+   • Record Block
+     • Compressed key-lengths block-size
+     • Compressed key-lengths block
+     • Compressed keys block-size
+     • Compressed keys block
+     • Compressed value-lengths block-size
+     • Compressed value-lengths block
+     • Compressed values block-size
+     • Compressed values block
+   • A sync-marker every few 100 bytes or so.
+ +

The compressed blocks of key lengths and value lengths consist of the + actual lengths of individual keys/values encoded in ZeroCompressedInteger + format.

+ + @see CompressionCodec]]> +
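+ A minimal write/read sketch using the createWriter factory and Reader described above (illustrative; the path and the key/value types are arbitrary):
+
+   import org.apache.hadoop.conf.Configuration;
+   import org.apache.hadoop.fs.FileSystem;
+   import org.apache.hadoop.fs.Path;
+   import org.apache.hadoop.io.IntWritable;
+   import org.apache.hadoop.io.SequenceFile;
+   import org.apache.hadoop.io.Text;
+
+   public class SequenceFileExample {
+     public static void main(String[] args) throws Exception {
+       Configuration conf = new Configuration();
+       FileSystem fs = FileSystem.get(conf);
+       Path file = new Path("example.seq");    // illustrative path
+
+       // Write a few records (compression settings come from the configuration).
+       SequenceFile.Writer writer =
+           SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class);
+       try {
+         for (int i = 0; i < 5; i++) {
+           writer.append(new IntWritable(i), new Text("value-" + i));
+         }
+       } finally {
+         writer.close();
+       }
+
+       // Read the records back; next() returns false at end of file.
+       SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+       try {
+         IntWritable key = new IntWritable();
+         Text value = new Text();
+         while (reader.next(key, value)) {
+           System.out.println(key + "\t" + value);
+         }
+       } finally {
+         reader.close();
+       }
+     }
+   }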
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key, skipping its + value. True if another entry exists, and false at end of file.]]> + + + + + + + + key and + val. Returns true if such a pair exists and false when at + end of file]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The position passed must be a position returned by {@link + SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary + position, use {@link SequenceFile.Reader#sync(long)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SegmentDescriptor + @param segments the list of SegmentDescriptors + @param tmpDir the directory to write temporary files into + @return RawKeyValueIterator + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For best performance, applications should make sure that the {@link + Writable#readFields(DataInput)} implementation of their keys is + very efficient. In particular, it should avoid allocating memory.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This always returns a synchronized position. In other words, + immediately after calling {@link SequenceFile.Reader#seek(long)} with a position + returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However + the key may be earlier in the file than key last written when this + method was called (e.g., with block-compression, it may be the first key + in the block that was being written when this method was called).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key. Returns + true if such a key exists and false when at the end of the set.]]> + + + + + + + key. + Returns key, or null if no match exists.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + position. Note that this + method avoids using the converter or doing String instatiation + @return the Unicode scalar value at position or -1 + if the position is invalid or points to a + trailing byte]]> + + + + + + + + + + what in the backing + buffer, starting as position start. The starting + position is measured in bytes and the return value is in + terms of byte position in the buffer. The backing buffer is + not converted to a string for this operation. 
+ @return byte position of the first occurence of the search + string in the UTF-8 buffer or -1 if not found]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a Text with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException.]]> + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException. + @return ByteBuffer: bytes stores at ByteBuffer.array() + and length is ByteBuffer.limit()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In + addition, it provides methods for string traversal without converting the + byte array to a string.

Also includes utilities for + serializing/deserialing a string, coding/decoding a string, checking if a + byte array contains valid UTF8 code, calculating the length of an encoded + string.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a UTF8 with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + Also includes utilities for efficiently reading and writing UTF-8. + + @deprecated replaced by Text]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This is useful when a class may evolve, so that instances written by the + old version of the class may still be processed by the new version. To + handle this situation, {@link #readFields(DataInput)} + implementations should catch {@link VersionMismatchException}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VIntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VLongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + out. + + @param out DataOuput to serialize this object into. + @throws IOException]]> + + + + + + + in. + +

For efficiency, implementations should attempt to re-use storage in the + existing object where possible.

+ + @param in DataInput to deserialize this object from. + @throws IOException]]> +
+ + + Any key or value type in the Hadoop Map-Reduce + framework implements this interface.

+ +

Implementations typically implement a static read(DataInput) + method which constructs a new instance, calls {@link #readFields(DataInput)} + and returns the instance.

+ +

Example:

+

+     public class MyWritable implements Writable {
+       // Some data     
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public static MyWritable read(DataInput in) throws IOException {
+         MyWritable w = new MyWritable();
+         w.readFields(in);
+         return w;
+       }
+     }
+ 

]]> +
+ + + + + + + + WritableComparables can be compared to each other, typically + via Comparators. Any type which is to be used as a + key in the Hadoop Map-Reduce framework should implement this + interface.

+ +

Example:

+

+     public class MyWritableComparable implements WritableComparable {
+       // Some data
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public int compareTo(MyWritableComparable w) {
+         int thisValue = this.counter;
+         int thatValue = w.counter;
+         return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+       }
+     }
+ 

]]> +
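+ Where comparisons are hot, a raw comparator can order keys by their serialized bytes without deserializing them. A possible (illustrative) comparator for the example above, shown as it would appear nested inside MyWritableComparable and assuming the int counter is written first:
+
+     // Uses the org.apache.hadoop.io.WritableComparator helpers.
+     public static class Comparator extends WritableComparator {
+       public Comparator() {
+         super(MyWritableComparable.class);
+       }
+
+       public int compare(byte[] b1, int s1, int l1,
+                          byte[] b2, int s2, int l2) {
+         int thisCounter = readInt(b1, s1);   // counter is serialized first
+         int thatCounter = readInt(b2, s2);
+         return (thisCounter < thatCounter ? -1 :
+                 (thisCounter == thatCounter ? 0 : 1));
+       }
+     }
+
+     static {   // register it as the default comparator for the type
+       WritableComparator.define(MyWritableComparable.class, new Comparator());
+     }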
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The default implementation reads the data into two {@link + WritableComparable}s (using {@link + Writable#readFields(DataInput)}, then calls {@link + #compare(WritableComparable,WritableComparable)}.]]> + + + + + + + The default implementation uses the natural ordering, calling {@link + Comparable#compareTo(Object)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This base implemenation uses the natural ordering. To define alternate + orderings, override {@link #compare(WritableComparable,WritableComparable)}. + +

One may optimize compare-intensive operations by overriding + {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are + provided to assist in optimized implementations of this method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum type + @param in DataInput to read from + @param enumType Class type of Enum + @return Enum represented by String read from DataInput + @throws IOException]]> + + + + + + + + + + + + + + + + len number of bytes in input streamin + @param in input stream + @param len number of bytes to skip + @throws IOException when skipped less number of bytes]]> + + + + + + + + + + + + + + CompressionCodec for which to get the + Compressor + @return Compressor for the given + CompressionCodec from the pool or a new one]]> + + + + + + CompressionCodec for which to get the + Decompressor + @return Decompressor for the given + CompressionCodec the pool or a new one]]> + + + + + + Compressor to be returned to the pool]]> + + + + + + Decompressor to be returned to the + pool]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Implementations are assumed to be buffered. This permits clients to + reposition the underlying input stream then call {@link #resetState()}, + without having to also synchronize client buffers.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + true if a preset dictionary is needed for decompression. + @return true if a preset dictionary is needed for decompression]]> + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-lzo library is loaded & initialized; + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair. 
+ http://www.oberhumer.com/opensource/lzo/]]> + + + + + + + + + + + + + + + + + + + + + true if lzo compressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if lzo decompressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-zlib is loaded & initialized + and can be loaded for this job, else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keep trying a limited number of times, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+ + + + + + + + Keep trying for a maximum time, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by the number of tries so far. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by a random + number in the range [0, 2^(number of retries)). +

]]> +
+
+ + + + + + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + + + A retry policy for RemoteException. + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + Try once, and fail by re-throwing the exception. + This corresponds to having no retry mechanism in place. +

]]> +
+
+ + + + Try once, and fail silently for void methods, or by + re-throwing the exception for non-void methods. +

]]> +
+
+ + + + Keep trying forever. +

]]> +
+
+ + + A collection of useful implementations of {@link RetryPolicy}. +

]]> +
+
+ + + + + + + + + + Determines whether the framework should retry a + method for the given exception, and the number + of retries that have been made for that operation + so far. +

+ @param e The exception that caused the method to fail. + @param retries The number of times the method has been retried. + @return true if the method should be retried, + false if the method should not be retried + but shouldn't fail with an exception (only for void methods). + @throws Exception The re-thrown exception e indicating + that the method failed and should not be retried further.]]> +
+
+ + + Specifies a policy for retrying method failures. + Implementations of this interface should be immutable. +

]]> +
+
+ + + + + + + + + + + + Create a proxy for an interface of an implementation class + using the same retry policy for each method in the interface. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param retryPolicy the policy for retrying method call failures + @return the retry proxy]]> +
+
+ + + + + + + Create a proxy for an interface of an implementation class + using a set of retry policies specified by method name. + If no retry policy is defined for a method then a default of + {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param methodNameToPolicyMap a map of method names to retry policies + @return the retry proxy]]> +
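+ An illustrative wiring of a retry proxy (the EchoProtocol interface and its implementation are hypothetical; only RetryProxy.create and the RetryPolicies factories described above are assumed):
+
+   import java.util.concurrent.TimeUnit;
+   import org.apache.hadoop.io.retry.RetryPolicies;
+   import org.apache.hadoop.io.retry.RetryPolicy;
+   import org.apache.hadoop.io.retry.RetryProxy;
+
+   public class RetryProxyExample {
+     // Hypothetical interface and implementation, only to show the wiring.
+     interface EchoProtocol { String echo(String msg) throws Exception; }
+     static class EchoImpl implements EchoProtocol {
+       public String echo(String msg) { return msg; }
+     }
+
+     public static void main(String[] args) throws Exception {
+       // Retry each failed call up to 4 times, sleeping 10 seconds in between.
+       RetryPolicy policy =
+           RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 10, TimeUnit.SECONDS);
+       EchoProtocol proxy =
+           (EchoProtocol) RetryProxy.create(EchoProtocol.class, new EchoImpl(), policy);
+       System.out.println(proxy.echo("hello"));
+       // The Map<String, RetryPolicy> variant described above works the same
+       // way, but chooses a policy per method name.
+     }
+   }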
+
+ + + A factory for creating retry proxies. +

]]> +
+
+ +
+ + + + + + + + Prepare the deserializer for reading.

]]> +
+
+ + + + + + Deserialize the next object from the underlying input stream. + If the object t is non-null then this deserializer + may set its internal state to the next object read from the input + stream. Otherwise, if the object t is null a new + deserialized object will be created. +

+ @return the deserialized object]]> +
+
+ + + + Close the underlying input stream and clear up any resources.

]]> +
+
+ + + Provides a facility for deserializing objects of type from an + {@link InputStream}. +

+ +

+ Deserializers are stateful, but must not buffer the input since + other producers may read from the input between calls to + {@link #deserialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + A {@link RawComparator} that uses a {@link Deserializer} to deserialize + the objects to be compared so that the standard {@link Comparator} can + be used to compare them. +

+

+ One may optimize compare-intensive operations by using a custom + implementation of {@link RawComparator} that operates directly + on byte representations. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + An experimental {@link Serialization} for Java {@link Serializable} classes. +

+ @see JavaSerializationComparator]]> +
+
+ + + + + + + + + + + + + A {@link RawComparator} that uses a {@link JavaSerialization} + {@link Deserializer} to deserialize objects that are then compared via + their {@link Comparable} interfaces. +

+ @param + @see JavaSerialization]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + Encapsulates a {@link Serializer}/{@link Deserializer} pair. +

+ @param ]]> +
+
+ + + + + + + Serializations are found by reading the io.serializations + property from conf, which is a comma-delimited list of + classnames. +

]]> +
+
+ + + + + + + + + + + + A factory for {@link Serialization}s. +

]]> +
+
+ + + + + + + + Prepare the serializer for writing.

]]> +
+
+ + + + + Serialize t to the underlying output stream.

]]> +
+
+ + + + Close the underlying output stream and clear up any resources.

]]> +
+
+ + + Provides a facility for serializing objects of type to an + {@link OutputStream}. +

+ +

+ Serializers are stateful, but must not buffer the output since + other producers may write to the output between calls to + {@link #serialize(Object)}. +

+ @param ]]> +
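+ A short sketch of obtaining and using a Serializer through the SerializationFactory described above (illustrative; Text is used only because a Writable serialization is registered by default via io.serializations):
+
+   import java.io.ByteArrayOutputStream;
+   import org.apache.hadoop.conf.Configuration;
+   import org.apache.hadoop.io.Text;
+   import org.apache.hadoop.io.serializer.SerializationFactory;
+   import org.apache.hadoop.io.serializer.Serializer;
+
+   public class SerializerExample {
+     public static void main(String[] args) throws Exception {
+       Configuration conf = new Configuration();
+       // The factory returns a Serialization that accepts Text.
+       SerializationFactory factory = new SerializationFactory(conf);
+       Serializer<Text> serializer = factory.getSerializer(Text.class);
+
+       ByteArrayOutputStream out = new ByteArrayOutputStream();
+       serializer.open(out);
+       serializer.serialize(new Text("hello"));
+       serializer.close();
+       System.out.println(out.size() + " bytes written");
+     }
+   }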
+
+ + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param, to the IPC server running at + address, returning the value. Throws exceptions if there are + network problems or if the remote code threw an exception.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unwraps any IOException. + + @param lookupTypes the desired exception class. + @return IOException, which is either the lookupClass exception or this.]]> + + + + + This unwraps any Throwable that has a constructor taking + a String as a parameter. + Otherwise it returns this. + + @return Throwable]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + protocol is a Java interface. All parameters and return types must + be one of: + +
+   • a primitive type, boolean, byte, char, short, int, long, float, double, or void; or
+   • a {@link String}; or
+   • a {@link Writable}; or
+   • an array of the above types
+ + All methods in the protocol should throw only IOException. No field data of + the protocol instance is transmitted.]]> +
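+ An illustrative protocol interface that satisfies these constraints (the names are hypothetical; protocols in this codebase conventionally also extend VersionedProtocol so client and server can agree on a version):
+
+   public interface PingProtocol extends VersionedProtocol {
+     long versionID = 1L;
+
+     // primitive, String and Writable parameters/returns only,
+     // and every method throws just IOException:
+     boolean isAlive() throws IOException;
+     String echo(String message) throws IOException;
+     Text ping(Text message) throws IOException;
+   }
+
+ A server would expose an implementation of such an interface and clients would obtain a proxy for it through the RPC factory methods documented in this section.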
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + handlerCount determines + the number of handler threads that will be used to process calls.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #rpcQueueTime}.inc(time)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker, + as {@link JobTracker.State} + + @return the current state of the JobTracker.]]> + + + + + + + + + + + + ClusterStatus provides clients with information such as: +

+   1. Size of the cluster.
+   2. Task capacity of the cluster.
+   3. The number of currently running map & reduce tasks.
+   4. State of the JobTracker.

+ +

Clients can query for the latest ClusterStatus, via + {@link JobClient#getClusterStatus()}.

+ + @see JobClient]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter can be of + any {@link Enum} type.

+ +

Counters are bunched into {@link Group}s, each comprising of + counters from a particular Enum class.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of counters, comprising of counters from a particular + counter {@link Enum} class. + +

Group handles localization of the class name and the + counter names.

]]> +
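+ An illustrative counter enum and how a task can bump it (the enum and the surrounding map/reduce method are hypothetical; only Reporter.incrCounter is assumed):
+
+   // Counters of one enum class are reported together as one Group.
+   public enum RecordCounters { GOOD_RECORDS, BAD_RECORDS }
+
+   // Inside map()/reduce(), with the Reporter passed in by the framework:
+   reporter.incrCounter(RecordCounters.BAD_RECORDS, 1);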
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param fs the file system that the file is on + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides a generic implementation of + {@link #getSplits(JobConf, int)}. + Subclasses of FileInputFormat can also override the + {@link #isSplitable(FileSystem, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s.]]> + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the attemptid, say + attempt_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any + side-files required in ${mapred.work.output.dir} during execution + of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the + framework will move them out similarly - thus she doesn't have to pick + unique paths per task-attempt.

+ +

Note: the value of ${mapred.work.output.dir} during + execution of a particular task-attempt is actually + ${mapred.output.dir}/_temporary/_{$taskid}, and this value is + set by the map-reduce framework. So, just create any side-files in the + path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce + task to take advantage of this feature.

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
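+ An illustrative way to create such a side-file from within a task via getWorkOutputPath (the sideFile field and the file name are hypothetical):
+
+   private FSDataOutputStream sideFile;
+
+   public void configure(JobConf job) {
+     try {
+       Path workDir = FileOutputFormat.getWorkOutputPath(job);
+       FileSystem fs = workDir.getFileSystem(job);
+       // Written under .../_temporary/_${taskid}; promoted to
+       // ${mapred.output.dir} only if the task-attempt succeeds.
+       sideFile = fs.create(new Path(workDir, "side-data.txt"));
+     } catch (IOException e) {
+       throw new RuntimeException(e);
+     }
+   }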
+
+ + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This method is used to validate the input directories when a job is + submitted so that the {@link JobClient} can fail early, with an useful + error message, in case of errors. For e.g. input directory does not exist. +

+ + @param job job configuration. + @throws InvalidInputException if the job does not have valid input + @deprecated getSplits is called in the client and can perform any + necessary validation of the input]]> +
+
+ + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For e.g. a split could + be <input-file-path, start, offset> tuple. + + @param job job configuration. + @param numSplits the desired number of splits, a hint. + @return an array of {@link InputSplit}s for the job.]]> + + + + + + + + + It is the responsibility of the RecordReader to respect + record boundaries while processing the logical split to present a + record-oriented view to the individual task.

+ + @param split the {@link InputSplit} + @param job the job that this split belongs to + @return a {@link RecordReader}]]> +
+
+ + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
  1. + Validate the input-specification of the job. +
  2. + Split-up the input file(s) into logical {@link InputSplit}s, each of + which is then assigned to an individual {@link Mapper}. +
  3. +
  4. + Provide the {@link RecordReader} implementation to be used to glean + input records from the logical InputSplit for processing by + the {@link Mapper}. +
  5. +
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Clearly, logical splits based on input-size is insufficient for many + applications since record boundaries are to respected. In such cases, the + application has to also implement a {@link RecordReader} on whom lies the + responsibilty to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see JobClient + @see FileInputFormat]]> + + + + + + + + + + InputSplit. + + @return the number of bytes in the input split. + @throws IOException]]> + + + + + + InputSplit is + located as an array of Strings. + @throws IOException]]> + + + + InputSplit represents the data to be processed by an + individual {@link Mapper}. + +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + jobid doesn't correspond to any known job. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient is the primary interface for the user-job to interact + with the {@link JobTracker}. + + JobClient provides facilities to submit jobs, track their + progress, access component-tasks' reports/logs, get the Map-Reduce cluster + status information etc. + +

The job submission process involves: +

    +
  1. + Checking the input and output specifications of the job. +
  2. +
  3. + Computing the {@link InputSplit}s for the job. +
  4. +
  5. + Setup the requisite accounting information for the {@link DistributedCache} + of the job, if necessary. +
  6. +
  7. + Copying the job's jar and configuration to the map-reduce system directory + on the distributed file-system. +
  8. +
  9. + Submitting the job to the JobTracker and optionally monitoring + it's status. +
  10. +

+ + Normally the user creates the application, describes various facets of the + job via {@link JobConf} and then uses the JobClient to submit + the job and monitor its progress. + +

Here is an example on how to use JobClient:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     job.setInputPath(new Path("in"));
+     job.setOutputPath(new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+
+     // Submit the job, then poll for progress until the job is complete
+     JobClient.runJob(job);
+ 

+ +

Job Control

+ +

At times clients would chain map-reduce jobs to accomplish complex tasks + which cannot be done via a single map-reduce job. This is fairly easy since + the output of the job, typically, goes to distributed file-system and that + can be used as the input for the next job.

+ +

However, this also means that the onus on ensuring jobs are complete + (success/failure) lies squarely on the clients. In such situations the + various job-control options are: +

    +
  1. + {@link #runJob(JobConf)} : submits the job and returns only after + the job has completed. +
  2. +
  3. + {@link #submitJob(JobConf)} : only submits the job, then poll the + returned handle to the {@link RunningJob} to query status and make + scheduling decisions. +
  4. +
  5. + {@link JobConf#setJobEndNotificationURI(String)} : setup a notification + on job-completion, thus avoiding polling. +
  6. +

+ + @see JobConf + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if framework should keep the intermediate files + for failed tasks, false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Note: +

+ @param dir the {@link Path} of the output directory for the map-reduce job.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the outputs of the maps are to be compressed, + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This comparator should be provided if the equivalence rules for keys + for sorting the intermediates are different from those for grouping keys + before each call to + {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

+ +

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + in a single call to the reduce function if K1 and K2 compare as equal.

+ +

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + how keys are sorted, this can be used in conjunction to simulate + secondary sort on values.

+ +

Note: This is not a guarantee of the reduce sort being + stable in any sense. (In any case, with the order of available + map-outputs to the reduce being non-deterministic, it wouldn't make + that much sense.)

+ + @param theClass the comparator class to be used for grouping keys. + It should implement RawComparator. + @see #setOutputKeyComparatorClass(Class)]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. Typically the combiner is same as the + the {@link Reducer} for the job i.e. {@link #getReducerClass()}. + + @return the user-defined combiner class used to combine map-outputs.]]> + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. + +

The combiner is a task-level aggregation operation which, in some cases, + helps to cut down the amount of data transferred from the {@link Mapper} to + the {@link Reducer}, leading to better performance.

+ +

Typically the combiner is same as the Reducer for the + job i.e. {@link #setReducerClass(Class)}.

+ + @param theClass the user-defined combiner class used to combine + map-outputs.]]> +
+
+ + + + + + + + + + + true. + + @return true if speculative execution be used for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on, else false.]]> + + + + + true. + + @return true if speculative execution be + used for this job for map tasks, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for map tasks, + else false.]]> + + + + + true. + + @return true if speculative execution be used + for reduce tasks for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for reduce tasks, + else false.]]> + + + + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + Note: This is only a hint to the framework. The actual + number of spawned map tasks depends on the number of {@link InputSplit}s + generated by the job's {@link InputFormat#getSplits(JobConf, int)}. + + A custom {@link InputFormat} is typically used to accurately control + the number of map tasks for the job.

+ +

How many maps?

+ +

The number of maps is usually driven by the total size of the inputs + i.e. total number of blocks of the input files.

+ +

The right level of parallelism for maps seems to be around 10-100 maps + per-node, although it has been set up to 300 or so for very cpu-light map + tasks. Task setup takes awhile, so it is best if the maps take at least a + minute to execute.

+ +

The default behavior of file-based {@link InputFormat}s is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of input files. However, the {@link FileSystem} blocksize of the + input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Thus, if you expect 10TB of input data and have a blocksize of 128MB, + you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is + used to set it even higher.

+ + @param n the number of map tasks for this job. + @see InputFormat#getSplits(JobConf, int) + @see FileInputFormat + @see FileSystem#getDefaultBlockSize() + @see FileStatus#getBlockSize()]]> +
+
+ + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + How many reduces? + +

The right number of reduces seems to be 0.95 or + 1.75 multiplied by (<no. of nodes> * + + mapred.tasktracker.reduce.tasks.maximum). +

+ +

With 0.95 all of the reduces can launch immediately and + start transfering map outputs as the maps finish. With 1.75 + the faster nodes will finish their first round of reduces and launch a + second wave of reduces doing a much better job of load balancing.

+ +

Increasing the number of reduces increases the framework overhead, but + increases load balancing and lowers the cost of failures.

+ +

The scaling factors above are slightly less than whole numbers to + reserve a few reduce slots in the framework for speculative-tasks, failures + etc.

+ +

Reducer NONE

+ +

It is legal to set the number of reduce-tasks to zero.

+ +

In this case the output of the map-tasks directly go to distributed + file-system, to the path set by + {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the + framework doesn't sort the map-outputs before writing it out to HDFS.

+ + @param n the number of reduce tasks for this job.]]> +
+
+ + + mapred.map.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per map task.]]> + + + + + + + + + + + mapred.reduce.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per reduce task.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + noFailures, the + tasktracker is blacklisted for this job. + + @param noFailures maximum no. of failures of a given job per tasktracker.]]> + + + + + blacklisted for this job. + + @return the maximum no. of failures of a given job per tasktracker.]]> + + + + + failed. + + Defaults to zero, i.e. any failed map-task results in + the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + failed. + + Defaults to zero, i.e. any failed reduce-task results + in the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The debug script can aid debugging of failed map tasks. The script is + given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setMapDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param mDbgScript the script name]]> +
+
+ + + + + + + + + The debug script can aid debugging of failed reduce tasks. The script + is given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script file needs to be symlinked

+ +

Here is an example on how to submit a script +

+ job.setReduceDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param rDbgScript the script name]]> +
+
+ + + + + + + + null if it hasn't + been set. + @see #setJobEndNotificationURI(String)]]> + + + + + + The uri can contain 2 special parameters: $jobId and + $jobStatus. Those, if present, are replaced by the job's + identifier and completion-status respectively.

+ +

This is typically used by application-writers to implement chaining of + Map-Reduce jobs in an asynchronous manner.

+ + @param uri the job end notification uri + @see JobStatus + @see Job Completion and Chaining]]> +
+
+ + + + When a job starts, a shared directory is created at location + + ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ . + This directory is exposed to the users through + job.local.dir . + So, the tasks can use this space + as scratch space and share files among them.

+ This value is available as System property also. + + @return The localized job specific shared directory]]> +
+
+ + JobConf is the primary interface for a user to describe a + map-reduce job to the Hadoop framework for execution. The framework tries to + faithfully execute the job as-is described by JobConf, however: +
    +
  1. + Some configuration parameters might have been marked as + + final by administrators and hence cannot be altered. +
  2. +
  3. + While some job parameters are straightforward to set + (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with the + rest of the framework and/or job-configuration and are relatively more + complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}). +
  4. +

+ +

JobConf typically specifies the {@link Mapper}, combiner + (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and + {@link OutputFormat} implementations to be used etc. + +

Optionally JobConf is used to specify other advanced facets + of the job such as Comparators to be used, files to be put in + the {@link DistributedCache}, whether or not intermediate and/or job outputs + are to be compressed (and how), debuggability via user-provided scripts + ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}) + for doing post-processing on task logs, task's stdout, stderr, syslog, etc.

+ +

Here is an example on how to configure a job via JobConf:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     FileInputFormat.setInputPaths(job, new Path("in"));
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+     
+     job.setInputFormat(SequenceFileInputFormat.class);
+     job.setOutputFormat(SequenceFileOutputFormat.class);
+ 

+ + @see JobClient + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + any job + run on the jobtracker started at 200707121733, we would use : +
 
+ JobID.getTaskIDsPattern("200707121733", null);
+ 
+ which will return : +
 "job_200707121733_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @return a regex pattern matching JobIDs]]> +
+
+ + + An example JobID is : + job_200707121733_0003 , which represents the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse JobID strings, but rather + use appropriate constructors or {@link #forName(String)} method. + + @see TaskID + @see TaskAttemptID + @see JobTracker#getNewJobId() + @see JobTracker#getStartTime()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -archives + -files inputjar args]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + zero. + + @param conf configuration for the JobTracker. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + io.file.buffer.size specified in the given + Configuration. + @param in input stream + @param conf configuration + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Output pairs need not be of the same types as input pairs. A given + input pair may map to zero or many output pairs. Output pairs are + collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the input key. + @param value the input value. + @param output collects mapped keys and values. + @param reporter facility to report progress.]]> +
+ + + Maps are the individual tasks which transform input records into + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link JobConf} for the + job via the {@link JobConfigurable#configure(JobConf)} and initialize + themselves. Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

The framework then calls + {@link #map(Object, Object, OutputCollector, Reporter)} + for each key/value pair in the InputSplit for that task.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the grouping by specifying + a Comparator via + {@link JobConf#setOutputKeyComparatorClass(Class)}.

+ +

The grouped Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

The intermediate, grouped outputs are always stored in + {@link SequenceFile}s. Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the JobConf.

+ +

If the job has + zero + reduces then the output of the Mapper is directly written + to the {@link FileSystem} without grouping by keys.

+ +

Example:

+

+     public class MyMapper<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Mapper<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+       
+       private String mapTaskId;
+       private String inputFile;
+       private int noRecords = 0;
+       
+       public void configure(JobConf job) {
+         mapTaskId = job.get("mapred.task.id");
+         inputFile = job.get("mapred.input.file");
+       }
+       
+       public void map(K key, V val,
+                       OutputCollector<K, V> output, Reporter reporter)
+       throws IOException {
+         // Process the <key, value> pair (assume this takes a while)
+         // ...
+         // ...
+         
+         // Let the framework know that we are alive, and kicking!
+         // reporter.progress();
+         
+         // Process some more
+         // ...
+         // ...
+         
+         // Increment the no. of <key, value> pairs processed
+         ++noRecords;
+
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+        
+         // Every 100 records update application-level status
+         if ((noRecords%100) == 0) {
+           reporter.setStatus(mapTaskId + " processed " + noRecords + 
+                              " from input-file: " + inputFile); 
+         }
+         
+         // Output the result
+         output.collect(key, val);
+       }
+     }
+ 

+ +

Applications may write a custom {@link MapRunnable} to exert greater + control on map processing e.g. multi-threaded Mappers etc.

+ + @see JobConf + @see InputFormat + @see Partitioner + @see Reducer + @see MapReduceBase + @see MapRunnable + @see SequenceFile]]> +
+
+ + + + + + + + + + + + + + + + + + + + + Provides default no-op implementations for a few methods; most non-trivial + applications need to override some of them.

]]> +
+
+ + + + + + + + + + + <key, value> pairs. + +

Mapping of input records to output records is complete when this method + returns.

+ + @param input the {@link RecordReader} to read the input records. + @param output the {@link OutputCollector} to collect the output records. + @param reporter {@link Reporter} to report progress, status-updates etc. + @throws IOException]]> +
+
+ + Custom implementations of MapRunnable can exert greater + control on map processing e.g. multi-threaded, asynchronous mappers etc.

+ + @see Mapper]]> +
+
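+ A minimal MapRunnable sketch, roughly what the default runner does, assuming
+ the usual org.apache.hadoop.mapred imports plus
+ org.apache.hadoop.util.ReflectionUtils; the class name is illustrative:
+
+     public class SimpleMapRunner<K1, V1, K2, V2>
+         implements MapRunnable<K1, V1, K2, V2> {
+
+       private Mapper<K1, V1, K2, V2> mapper;
+
+       public void configure(JobConf job) {
+         mapper = (Mapper<K1, V1, K2, V2>)
+             ReflectionUtils.newInstance(job.getMapperClass(), job);
+       }
+
+       public void run(RecordReader<K1, V1> input,
+                       OutputCollector<K2, V2> output,
+                       Reporter reporter) throws IOException {
+         K1 key = input.createKey();
+         V1 value = input.createValue();
+         // Drive the Mapper over every record in the InputSplit
+         while (input.next(key, value)) {
+           mapper.map(key, value, output, reporter);
+         }
+         mapper.close();
+       }
+     }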
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly + equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} + to construct RecordReader's for MultiFileSplit's. + @see MultiFileSplit]]> +
+
+ + + + + + + + + + + + + + + + + th Path]]> + + + + + + + + + + + th Path]]> + + + + + + + + + + + + + + + + + + + + + + + MultiFileSplit can be used to implement {@link RecordReader}'s, with + reading one record per file. + @see FileSplit + @see MultiFileInputFormat]]> + + + + + + + + + + + + + + + <key, value> pairs output by {@link Mapper}s + and {@link Reducer}s. + +

OutputCollector is the generalization of the facility + provided by the Map-Reduce framework to collect data output by either the + Mapper or the Reducer i.e. intermediate outputs + or the output of the job.

]]> +
+
+ + + + + + + + + + + + + + + + + + + This is to validate the output specification for the job when the job + is submitted. Typically this checks that the output does not already exist, + throwing an exception when it already exists, so that output is not + overwritten.

+ + @param ignored + @param job job configuration. + @throws IOException when output should not be attempted]]> +
+
+ + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    +
  1. + Validate the output-specification of the job, e.g. check that the + output directory doesn't already exist. +
  2. + Provide the {@link RecordWriter} implementation to be used to write out + the output files of the job. Output files are stored in a + {@link FileSystem}. +
  3. +
+ + @see RecordWriter + @see JobConf]]> +
+
+ + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be partitioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.

+ + @see Reducer]]> +
+
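+ A minimal Partitioner sketch along the lines of the hash-based partitioning
+ described above; the class name is illustrative and the usual
+ org.apache.hadoop.mapred imports are assumed:
+
+     public class HashCodePartitioner<K2, V2> implements Partitioner<K2, V2> {
+
+       public void configure(JobConf job) { }
+
+       public int getPartition(K2 key, V2 value, int numPartitions) {
+         // Clear the sign bit so the result is always in [0, numPartitions)
+         return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
+       }
+     }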
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0 to 1.0. + @throws IOException]]> + + + + RecordReader reads <key, value> pairs from an + {@link InputSplit}. + +

RecordReader, typically, converts the byte-oriented view of + the input, provided by the InputSplit, and presents a + record-oriented view for the {@link Mapper} & {@link Reducer} tasks for + processing. It thus assumes the responsibility of processing record + boundaries and presenting the tasks with keys and values.

+ + @see InputSplit + @see InputFormat]]> +
+
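+ A toy RecordReader sketch that serves a single key/value pair, only to show
+ the contract described above; the class name and key are illustrative, and
+ org.apache.hadoop.io.Text plus the usual org.apache.hadoop.mapred imports
+ are assumed:
+
+     public class SingleRecordReader implements RecordReader<Text, Text> {
+
+       private final Text onlyValue;
+       private boolean consumed = false;
+
+       public SingleRecordReader(Text onlyValue) { this.onlyValue = onlyValue; }
+
+       public boolean next(Text key, Text value) {
+         if (consumed) { return false; }   // no more records in this split
+         key.set("only-key");
+         value.set(onlyValue);
+         consumed = true;
+         return true;
+       }
+
+       public Text createKey() { return new Text(); }
+       public Text createValue() { return new Text(); }
+       public long getPos() { return consumed ? 1 : 0; }
+       public float getProgress() { return consumed ? 1.0f : 0.0f; }
+       public void close() { }
+     }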
+ + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param reporter facility to report progress. + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + Reduces values for a given key. + +

The framework calls this method for each + <key, (list of values)> pair in the grouped inputs. + Output values must be of the same type as input values. Input keys must + not be altered. The framework will reuse the key and value objects + that are passed into the reduce, therefore the application should clone + the objects they want to keep a copy of. In many cases, all values are + combined into zero or one value. +

+ +

Output pairs are collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the key. + @param values the list of values to reduce. + @param output to collect keys and combined values. + @param reporter facility to report progress.]]> +
+ + + The number of Reducers for the job is set by the user via + {@link JobConf#setNumReduceTasks(int)}. Reducer implementations + can access the {@link JobConf} for the job via the + {@link JobConfigurable#configure(JobConf)} method and initialize themselves. + Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

Reducer has 3 primary phases:

+
    +
  1. + +

    Shuffle

    + +

    The Reducer takes the grouped output of a {@link Mapper} as its input. + In this phase the framework, for each Reducer, fetches the + relevant partition of the output of all the Mappers, via HTTP. +

    +
  2. + +
  3. +

    Sort

    + +

    The framework groups Reducer inputs by keys + (since different Mappers may have output the same key) in this + stage.

    + +

    The shuffle and sort phases occur simultaneously i.e. while outputs are + being fetched they are merged.

    + +
    SecondarySort
    + +

    If equivalence rules for keys while grouping the intermediates are + different from those for grouping keys before reduction, then one may + specify a Comparator via + {@link JobConf#setOutputValueGroupingComparator(Class)}. Since + {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to + control how intermediate keys are grouped, these can be used in conjunction + to simulate secondary sort on values (a configuration sketch for this set-up + follows the example below).

    + + + For example, say that you want to find duplicate web pages and tag them + all with the url of the "best" known example. You would set up the job + like: +
      +
    • Map Input Key: url
    • +
    • Map Input Value: document
    • +
    • Map Output Key: document checksum, url pagerank
    • +
    • Map Output Value: url
    • +
    • Partitioner: by checksum
    • +
    • OutputKeyComparator: by checksum and then decreasing pagerank
    • +
    • OutputValueGroupingComparator: by checksum
    • +
    +
  4. + +
  5. +

    Reduce

    + +

    In this phase the + {@link #reduce(Object, Iterator, OutputCollector, Reporter)} + method is called for each <key, (list of values)> pair in + the grouped inputs.

    +

    The output of the reduce task is typically written to the + {@link FileSystem} via + {@link OutputCollector#collect(Object, Object)}.

    +
  6. +
+ +

The output of the Reducer is not re-sorted.

+ +

Example:

+

+     public class MyReducer<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Reducer<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+        
+       private String reduceTaskId;
+       private int noKeys = 0;
+       
+       public void configure(JobConf job) {
+         reduceTaskId = job.get("mapred.task.id");
+       }
+       
+       public void reduce(K key, Iterator<V> values,
+                          OutputCollector<K, V> output, 
+                          Reporter reporter)
+       throws IOException {
+       
+         // Process
+         int noValues = 0;
+         while (values.hasNext()) {
+           V value = values.next();
+           
+           // Increment the no. of values for this key
+           ++noValues;
+           
+           // Process the <key, value> pair (assume this takes a while)
+           // ...
+           // ...
+           
+           // Let the framework know that we are alive, and kicking!
+           if ((noValues%10) == 0) {
+             reporter.progress();
+           }
+         
+           // Process some more
+           // ...
+           // ...
+           
+           // Output the <key, value> 
+           output.collect(key, value);
+         }
+         
+         // Increment the no. of <key, list of values> pairs processed
+         ++noKeys;
+         
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+         
+         // Every 100 keys update application-level status
+         if ((noKeys%100) == 0) {
+           reporter.setStatus(reduceTaskId + " processed " + noKeys);
+         }
+       }
+     }
+ 

+ + @see Mapper + @see Partitioner + @see Reporter + @see MapReduceBase]]> +
+
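+ A configuration sketch for the secondary-sort set-up described above; the
+ partitioner and comparator classes named here are hypothetical user classes,
+ not part of the framework:
+
+     JobConf job = new JobConf(new Configuration(), DedupJob.class);
+
+     job.setPartitionerClass(ChecksumPartitioner.class);       // by checksum
+     job.setOutputKeyComparatorClass(ChecksumThenPageRankComparator.class);
+     job.setOutputValueGroupingComparator(ChecksumComparator.class);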
+ + + + + + + + + + + + + + + Enum. + @param amount A non-negative amount by which the counter is to + be incremented.]]> + + + + + + + + + + + + + + InputSplit that the map is reading from. + @throws UnsupportedOperationException if called outside a mapper]]> + + + + + + + + + {@link Mapper} and {@link Reducer} can use the Reporter + provided to report progress or just indicate that they are alive. In + scenarios where the application takes an insignificant amount of time to + process individual key/value pairs, this is crucial since the framework + might assume that the task has timed-out and kill that task. + +

Applications can also update {@link Counters} via the provided + Reporter .

+ + @see Progressable + @see Counters]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RunningJob is the user-interface to query for details on a + running Map-Reduce job. + +

Clients can get hold of RunningJob via the {@link JobClient} + and then query the running-job for details such as name, configuration, + progress etc.

+ + @see JobClient]]> +
+
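+ A minimal sketch of submitting a job and polling its RunningJob handle,
+ assuming a configured JobConf named job; exception handling is omitted:
+
+     JobClient client = new JobClient(job);
+     RunningJob running = client.submitJob(job);
+
+     // Poll until the job finishes, reporting map/reduce progress
+     while (!running.isComplete()) {
+       System.out.println("map: " + running.mapProgress() +
+                          " reduce: " + running.reduceProgress());
+       Thread.sleep(5000);
+     }
+     System.out.println(running.isSuccessful() ? "Job succeeded" : "Job failed");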
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This allows the user to specify the key class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + This allows the user to specify the value class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output value class.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f. The filtering criteria is + MD5(key) % f == 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + f using + the criteria record# % f == 0. + For example, if the frequency is 10, one out of 10 records is returned.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + @param name The name of the server + @param port The port to use on the server + @param findPort whether the server should start at the given port and + increment by 1 until it finds a free port.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points to the log directory + "/static/" -> points to common static files (src/webapps/static) + "/" -> the jsp server code from (src/webapps/)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all task attempt IDs + of any jobtracker, in any job, of the first + map task, we would use : +
 
+ TaskAttemptID.getTaskAttemptIDsPattern(null, null, true, 1, null);
+ 
+ which will return : +
 "attempt_[^_]*_[0-9]*_m_000001_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @param attemptId the task attempt number, or null + @return a regex pattern matching TaskAttemptIDs]]> +
+
+ + + An example TaskAttemptID is : + attempt_200707121733_0003_m_000005_0 , which represents the + zeroth task attempt for the fifth map task in the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse TaskAttemptID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the first map task + of any jobtracker, of any job, we would use : +

 
+ TaskID.getTaskIDsPattern(null, null, true, 1);
+ 
+ which will return : +
 "task_[^_]*_[0-9]*_m_000001*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @return a regex pattern matching TaskIDs]]> +
+ + + + An example TaskID is : + task_200707121733_0003_m_000005 , which represents the + fifth map task in the third job running at the jobtracker + started at 200707121733. +

+ Applications should never construct or parse TaskID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.log.dir.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the Job was added.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ([,]*) + func ::= tbl(,"") + class ::= @see java.lang.Class#forName(java.lang.String) + path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) + } + Reads expression from the mapred.join.expr property and + user-supplied join types from mapred.join.define.<ident> + types. Paths supplied to tbl are given as input paths to the + InputFormat class listed. + @see #compose(java.lang.String, java.lang.Class, java.lang.String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ,

) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + mapred.join.define.<ident> to a classname. In the expression + mapred.join.expr, the identifier will be assumed to be a + ComposableRecordReader. + mapred.join.keycomparator can be a classname used to compare keys + in the join. + @see JoinRecordReader + @see MultiFilterRecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + capacity children to position + id in the parent reader. + The id of a root CompositeRecordReader is -1 by convention, but relying + on this is not recommended.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + override(S1,S2,S3) will prefer values + from S3 over S2, and values from S2 over S1 for all keys + emitted from all sources.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [,,...,]]]> + + + + + + + out. + TupleWritable format: + {@code + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Map implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured to use this MapRunnable class (using + the JobConf.setMapRunnerClass method), and + the number of threads the thread-pool can use is set with the + mapred.map.multithreadedrunner.threads property; its default + value is 10 threads. +
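+ A minimal configuration sketch; the JobConf set-up mirrors the earlier
+ JobConf example and the thread count shown is arbitrary:
+
+     JobConf job = new JobConf(conf, MyJob.class);
+     // Run this job's Mapper through the multi-threaded runner
+     job.setMapRunnerClass(MultithreadedMapRunner.class);
+     // Optionally override the default pool size of 10 threads
+     job.setInt("mapred.map.multithreadedrunner.threads", 4);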

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pairs. Uses + {@link StringTokenizer} to break text into tokens.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + generateKeyValPairs(Object key, Object value); public void + configure(JobConfjob); } + + The package also provides a base class, ValueAggregatorBaseDescriptor, + implementing the above interface. The user can extend the base class and + implement generateKeyValPairs accordingly. + + The primary work of generateKeyValPairs is to emit one or more key/value + pairs based on the input key/value pair. The key in an output key/value pair + encode two pieces of information: aggregation type and aggregation id. The + value will be aggregated onto the aggregation id according the aggregation + type. + + This class offers a function to generate a map/reduce job using Aggregate + framework. The function takes the following parameters: input directory spec + input format (text or sequence file) output directory a file specifying the + user plugin class]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When constructing the instance, if the factory property + contextName.class exists, + its value is taken to be the name of the class to instantiate. Otherwise, + the default is to create an instance of + org.apache.hadoop.metrics.spi.NullContext, which is a + dummy "no-op" context which will cause all metric data to be discarded. + + @param contextName the name of the context + @return the named MetricsContext]]> + + + + + + + + + + + + + + When the instance is constructed, this method checks if the file + hadoop-metrics.properties exists on the class path. If it + exists, it must be in the format defined by java.util.Properties, and all + the properties in the file are set as attributes on the newly created + ContextFactory instance. 
+ + @return the singleton ContextFactory instance]]> + + + + getFactory() method.]]> + + + + + + + + + + + + + + + + + + + startMonitoring() again after calling + this. + @see #close()]]> + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A record name identifies the kind of data to be reported. For example, a + program reporting statistics relating to the disks on a computer might use + a record name "diskStats".

+ + A record has zero or more tags. A tag has a name and a value. To + continue the example, the "diskStats" record might use a tag named + "diskName" to identify a particular disk. Sometimes it is useful to have + more than one tag, so there might also be a "diskType" with value "ide" or + "scsi" or whatever.

+ + A record also has zero or more metrics. These are the named + values that are to be reported to the metrics system. In the "diskStats" + example, possible metric names would be "diskPercentFull", "diskPercentBusy", + "kbReadPerSecond", etc.

+ + The general procedure for using a MetricsRecord is to fill in its tag and + metric values, and then call update() to pass the record to the + client library. + Metric data is not immediately sent to the metrics system + each time that update() is called. + An internal table is maintained, identified by the record name. This + table has columns + corresponding to the tag and the metric names, and rows + corresponding to each unique set of tag values. An update + either modifies an existing row in the table, or adds a new row with a set of + tag values that are different from all the other rows. Note that if there + are no tags, then there can be at most one row in the table.

+ + Once a row is added to the table, its data will be sent to the metrics system + on every timer period, whether or not it has been updated since the previous + timer period. If this is inappropriate, for example if metrics were being + reported by some transient object in an application, the remove() + method can be used to remove the row and thus stop the data from being + sent.

+ + Note that the update() method is atomic. This means that it is + safe for different threads to be updating the same metric. More precisely, + it is OK for different threads to call update() on MetricsRecord instances + with the same set of tag names and tag values. Different threads should + not use the same MetricsRecord instance at the same time.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MetricsContext.registerUpdater().]]> + + + + + + + + + + + + + + + + + + + + + + + + + fileName attribute, + if specified. Otherwise the data will be written to standard + output.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is configured by setting ContextFactory attributes which in turn + are usually configured through a properties file. All the attributes are + prefixed by the contextName. For example, the properties file might contain: +

+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ 
]]> +
+
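+ A sketch of the "diskStats" procedure described above, using the
+ MetricsContext and MetricsRecord methods documented in this package; checked
+ exceptions are omitted and the tag/metric values are illustrative:
+
+     MetricsContext context =
+         ContextFactory.getFactory().getContext("diskStats");
+     MetricsRecord record = context.createRecord("diskStats");
+
+     record.setTag("diskName", "sda");          // identifies the row
+     record.setMetric("diskPercentFull", 72);   // named value to report
+     record.update();                           // sent on each timer period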
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contextName.tableName. The returned map consists of + those attributes with the contextName and tableName stripped off.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class implements the internal table of metric data, and the timer + on which data is to be sent to the metrics system. Subclasses must + override the abstract emitRecord method in order to transmit + the data.

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + update + and remove().]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hostname or hostname:port. If + the specs string is null, defaults to localhost:defaultPort. + + @return a list of InetSocketAddress objects.]]> + + + + + + + + + + + + + + + + + + + ,name=" + Where the and are the supplied parameters + + @param serviceName + @param nameName + @param theMbean - the MBean to register + @return the named used to register the MBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.rpc.socket.factory.class.<ClassName>. When no + such parameter exists then fall back on the default socket factory as + configured by hadoop.rpc.socket.factory.class.default. If + this default socket factory is not configured, then fall back on the JVM + default socket factory. + + @param conf the configuration + @param clazz the class (usually a {@link VersionedProtocol}) + @return a socket factory]]> + + + + + + hadoop.rpc.socket.factory.default + + @param conf the configuration + @return the default socket factory as specified in the configuration or + the JVM default socket factory if the configuration does not + contain a default socket factory property.]]> + + + + + + + + + + + + + : + ://:/]]> + + + + + + + + : + ://:/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketInputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getInputStream()} is returned. In the later + case, the timeout argument is ignored and the timeout set with + {@link Socket#setSoTimeout(int)} applies for reads.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see #getInputStream(Socket, long) + + @param socket + @return InputStream for reading from the socket. + @throws IOException]]> +
+
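+ A short usage sketch, assuming an already connected Socket named socket that
+ was created through one of the NetUtils socket factories; timeouts are in
+ milliseconds:
+
+     // Falls back to Socket#getInputStream()/#getOutputStream() when the
+     // socket has no associated channel, as described above
+     InputStream in = NetUtils.getInputStream(socket, 30000);
+     OutputStream out = NetUtils.getOutputStream(socket, 30000);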
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + +
+ + From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketOutputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getOutputStream()} is returned. In the later + case, the timeout argument is ignored and the write will wait until + data is available.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see #getOutputStream(Socket, long) + + @param socket + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+
+ + + + + + + + + + + + + + + + + + + + + node + + @param node + a node + @return true if node is already in the tree; false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scope + if scope starts with ~, choose one from the all nodes except for the + ones in scope; otherwise, choose one from scope + @param scope range of nodes from which a node will be choosen + @return the choosen node]]> + + + + + + + scope but not in excludedNodes + if scope starts with ~, return the number of nodes that are not + in scope and excludedNodes; + @param scope a path string that may start with ~ + @param excludedNodes a list of nodes + @return number of available nodes]]> + + + + + + + + + + + + reader + It linearly scans the array, if a local node is found, swap it with + the first element of the array. + If a local rack node is found, swap it with the first element following + the local node. + If neither local node or local rack node is found, put a random replica + location at postion 0. + It leaves the rest nodes untouched.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
+
+ + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + +
+ + Create a new ouput stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = getCount(). + @param newCapacity The new capacity in bytes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index idx = startVector(...); + while (!idx.done()) { + .... // read element of a vector + idx.incr(); + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This task takes the given record definition files and compiles them into + java or c++ + files. It is then up to the user to compile the generated files. + +

The task requires the file or the nested fileset element to be + specified. Optional attributes are language (set the output + language, default is "java"), + destdir (name of the destination directory for generated java/c++ + code, default is ".") and failonerror (specifies error handling + behavior. default is true). +

Usage

+
+ <recordcc
+       destdir="${basedir}/gensrc"
+       language="java">
+   <fileset include="**\/*.jr" />
+ </recordcc>
+ 
]]> +
+
+ +
+ + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi as a comma separated string in + conf as a property attr + + The String starts with the user name followed by the default group names, + and other group names. + + @param conf configuration + @param attr property name + @param ugi a UnixUserGroupInformation]]> + + + + + + + + conf + + The object is expected to store with the property name attr + as a comma separated string that starts + with the user name followed by group names. + If the property name is not defined, return null. + It's assumed that there is only one UGI per user. If this user already + has a UGI in the ugi map, return the ugi in the map. + Otherwise, construct a UGI from the configuration, store it in the + ugi map and return it. + + @param conf configuration + @param attr property name + @return a UnixUGI + @throws LoginException if the stored string is ill-formatted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool supports archiving and anaylzing (sort/grep) of log-files. + It takes as input + a) Input uri which will serve uris of the logs to be archived. + b) Output directory (not mandatory). + b) Directory on dfs to archive the logs. + c) The sort/grep patterns for analyzing the files and separator for boundaries. + Usage: + Logalyzer -archive -archiveDir -analysis -logs -grep -sort -separator +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GenericOptionsParser to parse only the generic Hadoop + arguments. + + The array of string arguments other than the generic arguments can be + obtained by {@link #getRemainingArgs()}. + + @param conf the Configuration to modify. + @param args command-line arguments.]]> + + + + + GenericOptionsParser to parse given options as well + as generic Hadoop options. + + The resulting CommandLine object can be obtained by + {@link #getCommandLine()}. + + @param conf the configuration to modify + @param options options built by the caller + @param args User-specified arguments]]> + + + + + Strings containing the un-parsed arguments + or empty array if commandLine was not defined.]]> + + + + + CommandLine object + to process the parsed arguments. + + Note: If the object is created with + {@link #GenericOptionsParser(Configuration, String[])}, then returned + object will only contain parsed generic options. + + @return CommandLine representing list of arguments + parsed against Options descriptor.]]> + + + + + + + + + + GenericOptionsParser is a utility to parse command line + arguments generic to the Hadoop framework. + + GenericOptionsParser recognizes several standarad command + line arguments, enabling applications to easily specify a namenode, a + jobtracker, additional configuration resources etc. + +

Generic Options

+ +

The supported generic options are:

+

+     -conf <configuration file>     specify a configuration file
+     -D <property=value>            use value for given property
+     -fs <local|namenode:port>      specify a namenode
+     -jt <local|jobtracker:port>    specify a job tracker
+     -files <comma separated list of files>    specify comma separated
+                            files to be copied to the map reduce cluster
+     -libjars <comma separated list of jars>   specify comma separated
+                            jar files to include in the classpath.
+     -archives <comma separated list of archives>    specify comma
+             separated archives to be unarchived on the compute machines.
+
+ 

+ +

The general command line syntax is:

+

+ bin/hadoop command [genericOptions] [commandOptions]
+ 

+ +

Generic command line arguments might modify + Configuration objects, given to constructors.

+ +

The functionality is implemented using Commons CLI.

+ +

Examples:

+

+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+ 
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+     
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+     
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+ 
+ $ bin/hadoop jar -libjars testlib.jar 
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+ 

+ + @see Tool + @see ToolRunner]]> +
+
+ + + + + + + + + Class<T>) of the + argument of type T. + @param The type of the argument + @param t the object to get it class + @return Class<T>]]> + + + + + + + List<T> to a an array of + T[]. + @param c the Class object of the items in the list + @param list the list to convert]]> + + + + + + List<T> to a an array of + T[]. + @param list the list to convert + @throws ArrayIndexOutOfBoundsException if the list is empty. + Use {@link #toArray(Class, List)} if the list may be empty.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-hadoop is loaded, + else false]]> + + + + + + true if native hadoop libraries, if present, can be + used for this job; false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { pq.top().change(); pq.adjustTop(); } + instead of
+  { o = pq.pop(); o.change(); pq.push(o); }
+ 
]]> +
+
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clients and/or applications can use the provided Progressable + to explicitly report progress to the Hadoop framework. This is especially + important for operations which take an insignificant amount of time since, + in-lieu of the reported progress, the framework has to assume that an error + has occured and time-out the operation.

]]> +
+
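As a hedged sketch (the copy method and its arguments are invented for the example), a long-running operation can call progress() on the supplied Progressable so the framework does not conclude the task has hung:

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;
    import org.apache.hadoop.util.Progressable;

    public class SlowCopy {
      // Copies a stream, pinging the framework once per buffer so the task is not timed out.
      public static void copy(InputStream in, OutputStream out, Progressable progressable)
          throws IOException {
        byte[] buf = new byte[4096];
        int n;
        while ((n = in.read(buf)) > 0) {
          out.write(buf, 0, n);
          progressable.progress();   // report liveness to the framework
        }
      }
    }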
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Class is to be obtained + @return the correctly typed Class of the given object.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop Pipes + or Hadoop Streaming. + + It also checks to ensure that we are running on a *nix platform else + (e.g. in Cygwin/Windows) it returns null. + @param job job configuration + @return a String[] with the ulimit command arguments or + null if we are running on a non *nix platform or + if the limit is unspecified.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell interface. + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell can be used to run unix commands like du or + df. It also offers facilities to gate commands by + time-intervals.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ShellCommandExecutorshould be used in cases where the output + of the command needs no explicit parsing and where the command, working + directory and the environment remains unchanged. The output of the command + is stored as-is and is expected to be small.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ArrayList of string values]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the char to be escaped + @return an escaped string]]> + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the escaped char + @return an unescaped string]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tool, is the standard for any Map-Reduce tool/application. + The tool/application should delegate the handling of + + standard command-line options to {@link ToolRunner#run(Tool, String[])} + and only handle its custom arguments.

+ +

Here is how a typical Tool is implemented:

+

+     public class MyApp extends Configured implements Tool {
+     
+       public int run(String[] args) throws Exception {
+         // Configuration processed by ToolRunner
+         Configuration conf = getConf();
+         
+         // Create a JobConf using the processed conf
+         JobConf job = new JobConf(conf, MyApp.class);
+         
+         // Process custom command-line options
+         Path in = new Path(args[1]);
+         Path out = new Path(args[2]);
+         
+         // Specify various job-specific parameters     
+         job.setJobName("my-app");
+         job.setInputPath(in);
+         job.setOutputPath(out);
+         job.setMapperClass(MyApp.MyMapper.class);
+         job.setReducerClass(MyApp.MyReducer.class);
+
+         // Submit the job, then poll for progress until the job is complete
+         JobClient.runJob(job);
+         return 0;
+       }
+       
+       public static void main(String[] args) throws Exception {
+         // Let ToolRunner handle generic command-line options 
+         int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+         
+         System.exit(res);
+       }
+     }
+ 

+ + @see GenericOptionsParser + @see ToolRunner]]> +
+
+ + + + + + + + + + + + Tool by {@link Tool#run(String[])}, after + parsing with the given generic arguments. Uses the given + Configuration, or builds one if null. + + Sets the Tool's configuration with the possibly modified + version of the conf. + + @param conf Configuration for the Tool. + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + Tool with its Configuration. + + Equivalent to run(tool.getConf(), tool, args). + + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + + + ToolRunner can be used to run classes implementing + Tool interface. It works in conjunction with + {@link GenericOptionsParser} to parse the + + generic hadoop command line arguments and modifies the + Configuration of the Tool. The + application-specific options are passed along without being modified. +

+ + @see Tool + @see GenericOptionsParser]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
diff --git a/lib/jdiff/hadoop_0.18.3.xml b/lib/jdiff/hadoop_0.18.3.xml new file mode 100644 index 00000000000..564916fef77 --- /dev/null +++ b/lib/jdiff/hadoop_0.18.3.xml @@ -0,0 +1,38826 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + final. + + @param name resource to be added, the classpath is examined for a file + with that name.]]> + + + + + + final. + + @param url url of the resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param file file-path of resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + name property, null if + no such property exists. + + Values are processed for variable expansion + before being returned. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + name property, without doing + variable expansion. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + + value of the name property. + + @param name property name. + @param value property value.]]> + + + + + + + name property. If no such property + exists, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value, or defaultValue if the property + doesn't exist.]]> + + + + + + + name property as an int. + + If no such property exists, or if the specified value is not a valid + int, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as an int, + or defaultValue.]]> + + + + + + + name property to an int. + + @param name property name. + @param value int value of the property.]]> + + + + + + + name property as a long. + If no such property is specified, or if the specified value is not a valid + long, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a long, + or defaultValue.]]> + + + + + + + name property to a long. + + @param name property name. + @param value long value of the property.]]> + + + + + + + name property as a float. + If no such property is specified, or if the specified value is not a valid + float, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a float, + or defaultValue.]]> + + + + + + + name property as a boolean. + If no such property is specified, or if the specified value is not a valid + boolean, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a boolean, + or defaultValue.]]> + + + + + + + name property to a boolean. + + @param name property name. + @param value boolean value of the property.]]> + + + + + + + + + + + + + name property as + a collection of Strings. + If no such property is specified then empty collection is returned. +

+ This is an optimized version of {@link #getStrings(String)} + + @param name property name. + @return property value as a collection of Strings.]]> + + + + + + name property as + an array of Strings. + If no such property is specified then null is returned. + + @param name property name. + @return property value as an array of Strings, + or null.]]> + + + + + + + name property as + an array of Strings. + If no such property is specified then default value is returned. + + @param name property name. + @param defaultValue The default value + @return property value as an array of Strings, + or default value.]]> + + + + + + + name property as + as comma delimited values. + + @param name property name. + @param values The values]]> + + + + + + + + + + + + + + name property as a Class. + If no such property is specified, then defaultValue is + returned. + + @param name the class name. + @param defaultValue default value. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property as a Class + implementing the interface specified by xface. + + If no such property is specified, then defaultValue is + returned. + + An exception is thrown if the returned class does not implement the named + interface. + + @param name the class name. + @param defaultValue default value. + @param xface the interface implemented by the named class. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property to the name of a + theClass implementing the given interface xface. + + An exception is thrown if theClass does not implement the + interface xface. + + @param name property name. + @param theClass property value. + @param xface the interface implemented by the named class.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + + + + + name. + + @param name configuration resource name. + @return an input stream attached to the resource.]]> + + + + + + name. + + @param name configuration resource name. + @return a reader attached to the resource.]]> + + + + + String + key-value pairs in the configuration. + + @return an iterator over the entries.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + true to set quiet-mode on, false + to turn it off.]]> + + + + + + + + + + + Resources + +

Configurations are specified by resources. A resource contains a set of + name/value pairs as XML data. Each resource is named by either a + String or by a {@link Path}. If named by a String, + then the classpath is examined for a file with that name. If named by a + Path, then the local filesystem is examined directly, without + referring to the classpath. + +

Hadoop by default specifies two resources, loaded in-order from the + classpath:

    +
  1. hadoop-default.xml: Read-only defaults for hadoop.
  2. hadoop-site.xml: Site-specific configuration for a given hadoop installation.
+ Applications may add additional resources, which are loaded + subsequent to these resources in the order they are added. + +

Final Parameters

+ +

Configuration parameters may be declared final. + Once a resource declares a value final, no subsequently-loaded + resource can alter that value. + For example, one might define a final parameter with: +

+  <property>
+    <name>dfs.client.buffer.dir</name>
+    <value>/tmp/hadoop/dfs/client</value>
+    <final>true</final>
+  </property>
+ + Administrators typically define parameters as final in + hadoop-site.xml for values that user applications may not alter. + +

Variable Expansion

+ +

Value strings are first processed for variable expansion. The + available properties are:

    +
  1. Other properties defined in this Configuration; and, if a name is undefined here,
  2. Properties in {@link System#getProperties()}.
+ +

For example, if a configuration resource contains the following property + definitions: +

+  <property>
+    <name>basedir</name>
+    <value>/user/${user.name}</value>
+  </property>
+  
+  <property>
+    <name>tempdir</name>
+    <value>${basedir}/tmp</value>
+  </property>
+ + When conf.get("tempdir") is called, then ${basedir} + will be resolved to another property in this Configuration, while + ${user.name} would then ordinarily be resolved to the value + of the System property with that name.]]> +
+
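A small sketch of the expansion described above; the resource path is an assumption for the example:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;

    public class ExpansionDemo {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Assumed resource file declaring the basedir and tempdir properties shown above.
        conf.addResource(new Path("/etc/hadoop/my-conf.xml"));
        // ${basedir} is expanded from this Configuration, ${user.name} from the System properties.
        System.out.println(conf.get("tempdir"));   // e.g. /user/alice/tmp
      }
    }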
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DistributedCache is a facility provided by the Map-Reduce + framework to cache files (text, archives, jars etc.) needed by applications. +

+ +

Applications specify the files, via urls (hdfs:// or http://) to be cached + via the {@link JobConf}. The DistributedCache assumes that the + files specified via hdfs:// urls are already present on the + {@link FileSystem} at the path specified by the url.

+ +

The framework will copy the necessary files on to the slave node before + any tasks for the job are executed on that node. Its efficiency stems from + the fact that the files are only copied once per job and the ability to + cache archives which are un-archived on the slaves.

+ +

DistributedCache can be used to distribute simple, read-only + data/text files and/or more complex types such as archives, jars etc. + Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. + Jars may be optionally added to the classpath of the tasks, a rudimentary + software distribution mechanism. Files have execution permissions. + Optionally users can also direct it to symlink the distributed cache file(s) + into the working directory of the task.

+ +

DistributedCache tracks modification timestamps of the cache + files. Clearly the cache files should not be modified by the application + or externally while the job is executing.

+ +

Here is an illustrative example on how to use the + DistributedCache:

+

+     // Setting up the cache for the application
+     
+     1. Copy the requisite files to the FileSystem:
+     
+     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
+     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
+     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+     
+     2. Setup the application's JobConf:
+     
+     JobConf job = new JobConf();
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
+                                   job);
+     DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
+     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
+     
+     3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+     
+     public static class MapClass extends MapReduceBase  
+     implements Mapper<K, V, K, V> {
+     
+       private Path[] localArchives;
+       private Path[] localFiles;
+       
+       public void configure(JobConf job) {
+         // Get the cached archives/files
+         localArchives = DistributedCache.getLocalCacheArchives(job);
+         localFiles = DistributedCache.getLocalCacheFiles(job);
+       }
+       
+       public void map(K key, V value, 
+                       OutputCollector<K, V> output, Reporter reporter) 
+       throws IOException {
+         // Use data from the cached archives/files here
+         // ...
+         // ...
+         output.collect(k, v);
+       }
+     }
+     
+ 

+ + @see JobConf + @see JobClient]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BufferedFSInputStream + with the specified buffer size, + and saves its argument, the input stream + in, for later use. An internal + buffer array of length size + is created and stored in buf. + + @param in the underlying input stream. + @param size the buffer size. + @exception IllegalArgumentException if size <= 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + setReplication of FileSystem + @param src file name + @param replication new replication + @throws IOException + @return true if successful; + false if file does not exist or is a directory]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ']]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fs.scheme.class whose value names the FileSystem class. + The entire URI is passed to the FileSystem instance's initialize method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Return all the files that match filePattern and are not checksum + files. Results are sorted by their names. + +

+ A filename pattern is composed of regular characters and + special pattern matching characters, which are: + +

+
+
+

+

? +
Matches any single character. + +

+

* +
Matches zero or more characters. + +

+

[abc] +
Matches a single character from character set + {a,b,c}. + +

+

[a-b] +
Matches a single character from the character range + {a...b}. Note that character a must be + lexicographically less than or equal to character b. + +

+

[^a] +
Matches a single character that is not from character set or range + {a}. Note that the ^ character must occur + immediately to the right of the opening bracket. + +

+

\c +
Removes (escapes) any special meaning of character c. + +

+

{ab,cd} +
Matches a string from the string set {ab, cd} + +

+

{ab,c{de,fh}} +
Matches a string from the string set {ab, cde, cfh} + +
+
+
+ + @param pathPattern a regular expression specifying a pth pattern + + @return an array of paths that match the path pattern + @throws IOException]]> +
+
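For illustration (the directory layout is assumed), such a pattern can be expanded with one of the glob methods on FileSystem; globStatus is used here, older releases expose an equivalent that returns Path[] directly:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GlobDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Assumed layout: one directory per day, compressed part files inside.
        FileStatus[] matches = fs.globStatus(new Path("/logs/2008-*/part-*.gz"));
        for (FileStatus status : matches) {
          System.out.println(status.getPath());
        }
      }
    }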
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All user code that may potentially use the Hadoop Distributed + File System should be written to use a FileSystem object. The + Hadoop DFS is a multi-machine system that appears as a single + disk. It's useful because of its fault tolerance and potentially + very large capacity. + +

+ The local implementation is {@link LocalFileSystem} and distributed + implementation is {@link DistributedFileSystem}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FilterFileSystem contains + some other file system, which it uses as + its basic file system, possibly transforming + the data along the way or providing additional + functionality. The class FilterFileSystem + itself simply overrides all methods of + FileSystem with versions that + pass all requests to the contained file + system. Subclasses of FilterFileSystem + may further override some of these methods + and may also provide additional methods + and fields.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + buf at offset + and checksum into checksum. + The method is used for implementing read, therefore, it should be optimized + for sequential reading + @param pos chunkPos + @param buf desitination buffer + @param offset offset in buf at which to store data + @param len maximun number of bytes to read + @return number of bytes read]]> + + + + + + + + + + + + + + + + + -1 if the end of the + stream is reached. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + This method implements the general contract of the corresponding + {@link InputStream#read(byte[], int, int) read} method of + the {@link InputStream} class. As an additional + convenience, it attempts to read as many bytes as possible by repeatedly + invoking the read method of the underlying stream. This + iterated read continues until one of the following + conditions becomes true:

    + +
  • The specified number of bytes have been read, + +
  • The read method of the underlying stream returns + -1, indicating end-of-file. + +
If the first read on the underlying stream returns + -1 to indicate end-of-file then this method returns + -1. Otherwise this method returns the number of bytes + actually read. + + @param b destination buffer. + @param off offset at which to start storing bytes. + @param len maximum number of bytes to read. + @return the number of bytes read, or -1 if the end of + the stream has been reached. + @exception IOException if an I/O error occurs. + ChecksumException if any checksum error occurs]]> +
+ + + + + + + + + + + + n bytes of data from the + input stream. + +

This method may skip more bytes than are remaining in the backing + file. This produces no exception and the number of bytes skipped + may include some number of bytes that were beyond the EOF of the + backing file. Attempting to read from the stream after skipping past + the end will result in -1 indicating the end of the file. + +

If n is negative, no bytes are skipped. + + @param n the number of bytes to be skipped. + @return the actual number of bytes skipped. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to skip to is corrupted]]> + + + + + + + This method may seek past the end of the file. + This produces no exception and an attempt to read from + the stream will result in -1 indicating the end of the file. + + @param pos the postion to seek to. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to seek to is corrupted]]> + + + + + + + + + + len bytes from + stm + + @param stm an input stream + @param buf destiniation buffer + @param offset offset at which to store data + @param len number of bytes to read + @return actual number of bytes read + @throws IOException if there is any IO error]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + len bytes from the specified byte array + starting at offset off and generate a checksum for + each data chunk. + +

This method stores bytes from the given array into this + stream's buffer before it gets checksumed. The buffer gets checksumed + and flushed to the underlying output stream when all data + in a checksum chunk are in the buffer. If the buffer is empty and + requested length is at least as large as the size of next checksum chunk + size, this method will checksum and write the chunk directly + to the underlying output stream. Thus it avoids uneccessary data copy. + + @param b the data. + @param off the start offset in the data. + @param len the number of bytes to write. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if and only if pathname + should be included]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + trash feature. Files are moved to a user's trash + directory, a subdirectory of their home directory named ".Trash". Files are + initially moved to a current sub-directory of the trash directory. + Within that sub-directory their original path is preserved. Periodically + one may checkpoint the current trash and remove older checkpoints. (This + design permits trash management without enumeration of the full trash + content, without date support in the filesystem, and without clock + synchronization.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} backed by an FTP client provided by Apache Commons Net. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is a tool for migrating data from an older to a newer version + of an S3 filesystem. +

+

+ All files in the filesystem are migrated by re-writing the block metadata + - no datafiles are touched. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + Extracts AWS credentials from the filesystem URI or configuration. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A block-based {@link FileSystem} backed by + Amazon S3. +

+ @see NativeS3FileSystem]]> +
+
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If f is a file, this method will make a single call to S3. + If f is a directory, this method will make a maximum of + (n / 1000) + 2 calls to S3, where n is the total number of + files and directories contained directly in f. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} for reading and writing files stored on + Amazon S3. + Unlike {@link org.apache.hadoop.fs.s3.S3FileSystem} this implementation + stores files on S3 in their + native form so they can be read by other S3 tools. +

+ @see org.apache.hadoop.fs.s3.S3FileSystem]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nth value.]]> + + + + + + + + + + + + + + + + + + + + + nth value in the file.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + public class IntArrayWritable extends ArrayWritable { + public IntArrayWritable() { + super(IntWritable.class); + } + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a ByteWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataInputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using DataInput methods ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataOutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using DataOutput methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
]]> +
+
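A concrete variant of the pattern above, round-tripping a Writable through the reusable buffers (IntWritable is only an example of a value type):

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.hadoop.io.IntWritable;

    public class BufferRoundTrip {
      public static void main(String[] args) throws Exception {
        DataOutputBuffer out = new DataOutputBuffer();
        DataInputBuffer in = new DataInputBuffer();

        IntWritable value = new IntWritable(42);
        out.reset();                        // reuse the same backing array for every record
        value.write(out);                   // serialize into the buffer

        in.reset(out.getData(), out.getLength());
        IntWritable copy = new IntWritable();
        copy.readFields(in);                // deserialize from the same bytes
        System.out.println(copy.get());     // prints 42
      }
    }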
+ + + + + + + + + + + + + + + + + + + + + + + + the class of the item + @param conf the configuration to store + @param item the object to be stored + @param keyName the name of the key to use + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param items the objects to be stored + @param keyName the name of the key to use + @throws IndexOutOfBoundsException if the items array is empty + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + DefaultStringifier offers convenience methods to store/load objects to/from + the configuration. + + @param the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a DoubleWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a FloatWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When two sequence files, which have same Key type but different Value + types, are mapped out to reduce, multiple Value types is not allowed. + In this case, this class can help you wrap instances with different types. +

+ +

+ Compared with ObjectWritable, this class is much more effective, + because ObjectWritable will append the class declaration as a String + into the output file in every Key-Value pair. +

+ +

+ Generic Writable implements {@link Configurable} interface, so that it will be + configured by the framework. The configuration is passed to the wrapped objects + implementing {@link Configurable} interface before deserialization. +

+ + how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines + the classes which will be wrapped in GenericObject in application. + Attention: this classes defined in getTypes() method, must + implement Writable interface. +

+ + The code looks like this: +
+ public class GenericObject extends GenericWritable {
+ 
+   private static Class[] CLASSES = {
+               ClassType1.class, 
+               ClassType2.class,
+               ClassType3.class,
+               };
+
+   protected Class[] getTypes() {
+       return CLASSES;
+   }
+
+ }
+ 
+ + @since Nov 8, 2006]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new InputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using InputStream methods ...
+ }
+ 
+ @see DataInputBuffer + @see DataOutput]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a IntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + closes the input and output streams + at the end. + @param in InputStrem to read from + @param out OutputStream to write to + @param conf the Configuration object]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ignore any {@link IOException} or + null pointers. Must only be used for cleanup in exception handlers. + @param log the log to record problems to at debug level. Can be null. + @param closeables the objects to close]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a LongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A map is a directory containing two files, the data file, + containing all keys and values in the map, and a smaller index + file, containing a fraction of the keys. The fraction is determined by + {@link Writer#getIndexInterval()}. + +

The index file is read entirely into memory. Thus key implementations + should try to keep themselves small. + +

Map files are created by adding entries in-order. To maintain a large + database, perform updates by copying the previous version of a database and + merging in a sorted change list, to create a new version of the database in + a new file. Sorting large change lists can be done with {@link + SequenceFile.Sorter}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key and + val. Returns true if such a pair exists and false when at + the end of the map]]> + + + + + + + + + + + + + + + + key or if it does not exist, at the first entry + after the named key. + +- * @param key - key that we're trying to find +- * @param val - data value if key is found +- * @return - the key that was the closest match or null if eof.]]> + + + + + + + + + key does not exist, return + the first entry that falls just before the key. Otherwise, + return the record that sorts just after. + @return - the key that was the closest match or null if eof.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is an MD5Hash whose digest contains the + same values.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new OutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ OutputBuffer buffer = new OutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using OutputStream methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
+ @see DataOutputBuffer + @see InputBuffer]]> +
+
+ + + + + + + + + + + + + + + A {@link Comparator} that operates directly on byte representations of + objects. +

+ @param + @see DeserializerComparator]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SequenceFiles are flat files consisting of binary key/value + pairs. + +

SequenceFile provides {@link Writer}, {@link Reader} and + {@link Sorter} classes for writing, reading and sorting respectively.

+ + There are three SequenceFile Writers based on the + {@link CompressionType} used to compress key/value pairs: +
    +
  1. Writer: Uncompressed records.
  2. RecordCompressWriter: Record-compressed files, only compress values.
  3. BlockCompressWriter: Block-compressed files, both keys & values are collected in 'blocks' separately and compressed. The size of the 'block' is configurable.
+ +

The actual compression algorithm used to compress key and/or values can be + specified by using the appropriate {@link CompressionCodec}.

+ +

The recommended way is to use the static createWriter methods + provided by the SequenceFile to chose the preferred format.

+ +

The {@link Reader} acts as the bridge and can read any of the above + SequenceFile formats.

+ +

SequenceFile Formats

+ +

Essentially there are 3 different formats for SequenceFiles + depending on the CompressionType specified. All of them share a + common header described below. + +

+
    +
  • version - 3 bytes of magic header SEQ, followed by 1 byte of actual version number (e.g. SEQ4 or SEQ6)
  • keyClassName - key class
  • valueClassName - value class
  • compression - A boolean which specifies if compression is turned on for keys/values in this file.
  • blockCompression - A boolean which specifies if block-compression is turned on for keys/values in this file.
  • compression codec - CompressionCodec class which is used for compression of keys and/or values (if compression is enabled).
  • metadata - {@link Metadata} for this file.
  • sync - A sync marker to denote end of the header.
+ +
Uncompressed SequenceFile Format
+
    +
  • Header
  • Record
      • Record length
      • Key length
      • Key
      • Value
  • A sync-marker every few 100 bytes or so.
+ +
Record-Compressed SequenceFile Format
+
    +
  • Header
  • Record
      • Record length
      • Key length
      • Key
      • Compressed Value
  • A sync-marker every few 100 bytes or so.
+ +
Block-Compressed SequenceFile Format
+
    +
  • Header
  • Record Block
      • Compressed key-lengths block-size
      • Compressed key-lengths block
      • Compressed keys block-size
      • Compressed keys block
      • Compressed value-lengths block-size
      • Compressed value-lengths block
      • Compressed values block-size
      • Compressed values block
  • A sync-marker every few 100 bytes or so.
+ +

The compressed blocks of key lengths and value lengths consist of the + actual lengths of individual keys/values encoded in ZeroCompressedInteger + format.

+ + @see CompressionCodec]]> +
+
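A minimal sketch (the path and the key/value types are assumptions) of writing a SequenceFile with one of the createWriter factory methods and reading it back:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class SequenceFileDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path file = new Path("/tmp/example.seq");

        SequenceFile.Writer writer =
            SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class);
        try {
          writer.append(new IntWritable(1), new Text("one"));
          writer.append(new IntWritable(2), new Text("two"));
        } finally {
          writer.close();
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
        try {
          IntWritable key = new IntWritable();
          Text value = new Text();
          while (reader.next(key, value)) {
            System.out.println(key + "\t" + value);
          }
        } finally {
          reader.close();
        }
      }
    }

Other createWriter overloads accept a CompressionType and CompressionCodec; the form shown here writes uncompressed records.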
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key, skipping its + value. True if another entry exists, and false at end of file.]]> + + + + + + + + key and + val. Returns true if such a pair exists and false when at + end of file]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The position passed must be a position returned by {@link + SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary + position, use {@link SequenceFile.Reader#sync(long)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SegmentDescriptor + @param segments the list of SegmentDescriptors + @param tmpDir the directory to write temporary files into + @return RawKeyValueIterator + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For best performance, applications should make sure that the {@link + Writable#readFields(DataInput)} implementation of their keys is + very efficient. In particular, it should avoid allocating memory.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This always returns a synchronized position. In other words, + immediately after calling {@link SequenceFile.Reader#seek(long)} with a position + returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However + the key may be earlier in the file than key last written when this + method was called (e.g., with block-compression, it may be the first key + in the block that was being written when this method was called).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key. Returns + true if such a key exists and false when at the end of the set.]]> + + + + + + + key. + Returns key, or null if no match exists.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + position. Note that this + method avoids using the converter or doing String instatiation + @return the Unicode scalar value at position or -1 + if the position is invalid or points to a + trailing byte]]> + + + + + + + + + + what in the backing + buffer, starting as position start. The starting + position is measured in bytes and the return value is in + terms of byte position in the buffer. The backing buffer is + not converted to a string for this operation. 
+ @return byte position of the first occurence of the search + string in the UTF-8 buffer or -1 if not found]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a Text with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException.]]> + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException. + @return ByteBuffer: bytes stores at ByteBuffer.array() + and length is ByteBuffer.limit()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In + addition, it provides methods for string traversal without converting the + byte array to a string.

Also includes utilities for + serializing/deserialing a string, coding/decoding a string, checking if a + byte array contains valid UTF8 code, calculating the length of an encoded + string.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a UTF8 with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + Also includes utilities for efficiently reading and writing UTF-8. + + @deprecated replaced by Text]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This is useful when a class may evolve, so that instances written by the + old version of the class may still be processed by the new version. To + handle this situation, {@link #readFields(DataInput)} + implementations should catch {@link VersionMismatchException}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VIntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VLongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + out. + + @param out DataOuput to serialize this object into. + @throws IOException]]> + + + + + + + in. + +

For efficiency, implementations should attempt to re-use storage in the + existing object where possible.

+ + @param in DataInput to deseriablize this object from. + @throws IOException]]> +
+ + + Any key or value type in the Hadoop Map-Reduce + framework implements this interface.

+ +

Implementations typically implement a static read(DataInput) + method which constructs a new instance, calls {@link #readFields(DataInput)} + and returns the instance.

+ +

Example:

+

+     public class MyWritable implements Writable {
+       // Some data     
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public static MyWritable read(DataInput in) throws IOException {
+         MyWritable w = new MyWritable();
+         w.readFields(in);
+         return w;
+       }
+     }
+ 

]]> +
+ + + + + + + + WritableComparables can be compared to each other, typically + via Comparators. Any type which is to be used as a + key in the Hadoop Map-Reduce framework should implement this + interface.

+ +

Example:

+

+     public class MyWritableComparable implements WritableComparable {
+       // Some data
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public int compareTo(MyWritableComparable w) {
+         int thisValue = this.counter;
+         int thatValue = w.counter;
+         return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+       }
+     }
+ 

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The default implementation reads the data into two {@link + WritableComparable}s (using {@link + Writable#readFields(DataInput)}, then calls {@link + #compare(WritableComparable,WritableComparable)}.]]> + + + + + + + The default implementation uses the natural ordering, calling {@link + Comparable#compareTo(Object)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This base implemenation uses the natural ordering. To define alternate + orderings, override {@link #compare(WritableComparable,WritableComparable)}. + +

One may optimize compare-intensive operations by overriding + {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are + provided to assist in optimized implementations of this method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum type + @param in DataInput to read from + @param enumType Class type of Enum + @return Enum represented by String read from DataInput + @throws IOException]]> + + + + + + + + + + + + + + + + len number of bytes in input streamin + @param in input stream + @param len number of bytes to skip + @throws IOException when skipped less number of bytes]]> + + + + + + + + + + + + + + CompressionCodec for which to get the + Compressor + @return Compressor for the given + CompressionCodec from the pool or a new one]]> + + + + + + CompressionCodec for which to get the + Decompressor + @return Decompressor for the given + CompressionCodec the pool or a new one]]> + + + + + + Compressor to be returned to the pool]]> + + + + + + Decompressor to be returned to the + pool]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Implementations are assumed to be buffered. This permits clients to + reposition the underlying input stream then call {@link #resetState()}, + without having to also synchronize client buffers.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + true if a preset dictionary is needed for decompression. + @return true if a preset dictionary is needed for decompression]]> + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-lzo library is loaded & initialized; + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair. 
+ http://www.oberhumer.com/opensource/lzo/]]> + + + + + + + + + + + + + + + + + + + + + true if lzo compressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if lzo decompressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-zlib is loaded & initialized + and can be loaded for this job, else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keep trying a limited number of times, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+ + + + + + + + Keep trying for a maximum time, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by the number of tries so far. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by a random + number in the range of [0, 2 to the number of retries) +

]]> +
+
+ + + + + + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + + + A retry policy for RemoteException + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + Try once, and fail by re-throwing the exception. + This corresponds to having no retry mechanism in place. +

]]> +
+
+ + + + Try once, and fail silently for void methods, or by + re-throwing the exception for non-void methods. +

]]> +
+
+ + + + Keep trying forever. +

]]> +
+
+ + + A collection of useful implementations of {@link RetryPolicy}. +

]]> +
+
+ + + + + + + + + + Determines whether the framework should retry a + method for the given exception, and the number + of retries that have been made for that operation + so far. +

+ @param e The exception that caused the method to fail. + @param retries The number of times the method has been retried. + @return true if the method should be retried, + false if the method should not be retried + but shouldn't fail with an exception (only for void methods). + @throws Exception The re-thrown exception e indicating + that the method failed and should not be retried further.]]> +
+
+ + + Specifies a policy for retrying method failures. + Implementations of this interface should be immutable. +

]]> +
+
+ + + + + + + + + + + + Create a proxy for an interface of an implementation class + using the same retry policy for each method in the interface. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param retryPolicy the policy for retrying method call failures + @return the retry proxy]]> +
+
+ + + + + + + Create a proxy for an interface of an implementation class + using a set of retry policies specified by method name. + If no retry policy is defined for a method then a default of + {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param methodNameToPolicyMap a map of method names to retry policies + @return the retry proxy]]> +
+
+ + + A factory for creating retry proxies. +

]]> +
+
+ +
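+ A minimal sketch of how the retry helpers above might be combined, assuming a hypothetical Pinger interface; the factory signatures shown should be checked against the release in use:
+     import java.io.IOException;
+     import java.util.HashMap;
+     import java.util.Map;
+     import java.util.concurrent.TimeUnit;
+     import org.apache.hadoop.io.retry.RetryPolicies;
+     import org.apache.hadoop.io.retry.RetryPolicy;
+     import org.apache.hadoop.io.retry.RetryProxy;
+
+     public class RetryExample {
+       /** A hypothetical interface whose calls may fail transiently. */
+       public interface Pinger {
+         void ping() throws IOException;
+       }
+
+       /** Wrap an implementation so that ping() is retried; other methods are not. */
+       public static Pinger withRetries(Pinger direct) {
+         // Keep trying a limited number of times, waiting a fixed time between attempts.
+         RetryPolicy fourTries =
+             RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 10, TimeUnit.SECONDS);
+
+         Map<String, RetryPolicy> policies = new HashMap<String, RetryPolicy>();
+         policies.put("ping", fourTries);
+
+         // Methods without an entry fall back to TRY_ONCE_THEN_FAIL.
+         return (Pinger) RetryProxy.create(Pinger.class, direct, policies);
+       }
+     }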
+ + + + + + + + Prepare the deserializer for reading.

]]> +
+
+ + + + + + Deserialize the next object from the underlying input stream. + If the object t is non-null then this deserializer + may set its internal state to the next object read from the input + stream. Otherwise, if the object t is null a new + deserialized object will be created. +

+ @return the deserialized object]]> +
+
+ + + + Close the underlying input stream and clear up any resources.

]]> +
+
+ + + Provides a facility for deserializing objects of type T from an + {@link InputStream}. +

+ +

+ Deserializers are stateful, but must not buffer the input since + other producers may read from the input between calls to + {@link #deserialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + A {@link RawComparator} that uses a {@link Deserializer} to deserialize + the objects to be compared so that the standard {@link Comparator} can + be used to compare them. +

+

+ One may optimize compare-intensive operations by using a custom + implementation of {@link RawComparator} that operates directly + on byte representations. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + An experimental {@link Serialization} for Java {@link Serializable} classes. +

+ @see JavaSerializationComparator]]> +
+
+ + + + + + + + + + + + + A {@link RawComparator} that uses a {@link JavaSerialization} + {@link Deserializer} to deserialize objects that are then compared via + their {@link Comparable} interfaces. +

+ @param + @see JavaSerialization]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + Encapsulates a {@link Serializer}/{@link Deserializer} pair. +

+ @param ]]> +
+
+ + + + + + + Serializations are found by reading the io.serializations + property from conf, which is a comma-delimited list of + classnames. +

]]> +
+
+ + + + + + + + + + + + A factory for {@link Serialization}s. +

]]> +
+
+ + + + + + + + Prepare the serializer for writing.

]]> +
+
+ + + + + Serialize t to the underlying output stream.

]]> +
+
+ + + + Close the underlying output stream and clear up any resources.

]]> +
+
+ + + Provides a facility for serializing objects of type T to an + {@link OutputStream}. +

+ +

+ Serializers are stateful, but must not buffer the output since + other producers may write to the output between calls to + {@link #serialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + + + +
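+ A minimal sketch of the open/serialize/close contract described above, for a made-up Point record type; the type, class names and the org.apache.hadoop.io.serializer package layout are assumptions for illustration:
+     import java.io.DataOutputStream;
+     import java.io.IOException;
+     import java.io.OutputStream;
+     import org.apache.hadoop.io.serializer.Serializer;
+
+     public class PointSerialization {
+       /** A tiny value type used only for illustration. */
+       public static class Point {
+         public int x;
+         public int y;
+       }
+
+       /** Stateful, but writes straight through to the supplied stream. */
+       public static class PointSerializer implements Serializer<Point> {
+         private DataOutputStream out;
+
+         public void open(OutputStream out) throws IOException {
+           // Prepare the serializer for writing.
+           this.out = new DataOutputStream(out);
+         }
+
+         public void serialize(Point p) throws IOException {
+           // Serialize the record to the underlying output stream.
+           out.writeInt(p.x);
+           out.writeInt(p.y);
+         }
+
+         public void close() throws IOException {
+           // Close the underlying output stream and release resources.
+           out.close();
+         }
+       }
+     }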
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param, to the IPC server running at + address, returning the value. Throws exceptions if there are + network problems or if the remote code threw an exception.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unwraps any IOException. + + @param lookupTypes the desired exception class. + @return IOException, which is either the lookupClass exception or this.]]> + + + + + This unwraps any Throwable that has a constructor taking + a String as a parameter. + Otherwise it returns this. + + @return Throwable]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + protocol is a Java interface. All parameters and return types must + be one of: + +
  • a primitive type, boolean, byte, char, short, int, long, float, double, or void; or
  • a {@link String}; or
  • a {@link Writable}; or
  • an array of the above types
+ + All methods in the protocol should throw only IOException. No field data of + the protocol instance is transmitted.]]> +
+
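+ A hypothetical protocol interface that respects the type restrictions above (primitives, Strings, Writables and arrays of those, with every method declaring IOException); the interface name and methods are invented for illustration, and the client/server wiring is not shown:
+     import java.io.IOException;
+     import org.apache.hadoop.io.Text;
+
+     public interface InventoryProtocol {
+       // primitive return type, String parameter
+       long itemCount(String itemName) throws IOException;
+
+       // array of Writables
+       Text[] listItems(int limit) throws IOException;
+
+       // void return, mixed primitive/String parameters
+       void recordDelivery(String itemName, long quantity) throws IOException;
+     }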
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + handlerCount determines + the number of handler threads that will be used to process calls.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #rpcQueueTime}.inc(time)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker, + as {@link JobTracker.State} + + @return the current state of the JobTracker.]]> + + + + + + + + + + + + ClusterStatus provides clients with information such as: +

    +
  1. Size of the cluster.
  2. Task capacity of the cluster.
  3. The number of currently running map & reduce tasks.
  4. State of the JobTracker.

+ +

Clients can query for the latest ClusterStatus, via + {@link JobClient#getClusterStatus()}.

+ + @see JobClient]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter can be of + any {@link Enum} type.

+ +

Counters are bunched into {@link Group}s, each comprising of + counters from a particular Enum class.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of counters, comprising of counters from a particular + counter {@link Enum} class. + +

Group handles localization of the class name and the + counter names.

]]> +
+
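+ A small sketch of a counter Enum being incremented from a map task; each enum value becomes one Counter inside a Group named after the enum class (class and enum names here are illustrative):
+     import java.io.IOException;
+     import org.apache.hadoop.io.LongWritable;
+     import org.apache.hadoop.io.Text;
+     import org.apache.hadoop.mapred.MapReduceBase;
+     import org.apache.hadoop.mapred.Mapper;
+     import org.apache.hadoop.mapred.OutputCollector;
+     import org.apache.hadoop.mapred.Reporter;
+
+     public class CountingMapper extends MapReduceBase
+         implements Mapper<LongWritable, Text, Text, LongWritable> {
+
+       static enum RecordCounters { TOTAL, EMPTY }
+
+       public void map(LongWritable key, Text value,
+                       OutputCollector<Text, LongWritable> output,
+                       Reporter reporter) throws IOException {
+         reporter.incrCounter(RecordCounters.TOTAL, 1);
+         if (value.toString().length() == 0) {
+           reporter.incrCounter(RecordCounters.EMPTY, 1);
+           return;
+         }
+         output.collect(value, new LongWritable(1));
+       }
+     }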
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param fs the file system that the file is on + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides a generic implementation of + {@link #getSplits(JobConf, int)}. + Subclasses of FileInputFormat can also override the + {@link #isSplitable(FileSystem, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s.]]> + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the attemptid, say + attempt_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any + side-files required in ${mapred.work.output.dir} during execution + of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the + framework will move them out similarly - thus she doesn't have to pick + unique paths per task-attempt.

+ +

Note: the value of ${mapred.work.output.dir} during + execution of a particular task-attempt is actually + ${mapred.output.dir}/_temporary/_{$taskid}, and this value is + set by the map-reduce framework. So, just create any side-files in the + path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce + task to take advantage of this feature.

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
+
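+ A sketch of creating a side-file under the per-attempt work directory described above, from inside a task that holds its JobConf as job; error handling is omitted and the file name is only an example:
+     Path workDir = FileOutputFormat.getWorkOutputPath(job);   // ${mapred.work.output.dir}
+     Path sideFile = new Path(workDir, "index-" + job.get("mapred.task.id"));
+     FSDataOutputStream side = sideFile.getFileSystem(job).create(sideFile);
+     side.writeBytes("side data\n");
+     side.close();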
+ + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This method is used to validate the input directories when a job is + submitted so that the {@link JobClient} can fail early, with an useful + error message, in case of errors. For e.g. input directory does not exist. +

+ + @param job job configuration. + @throws InvalidInputException if the job does not have valid input + @deprecated getSplits is called in the client and can perform any + necessary validation of the input]]> +
+
+ + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For e.g. a split could + be <input-file-path, start, offset> tuple. + + @param job job configuration. + @param numSplits the desired number of splits, a hint. + @return an array of {@link InputSplit}s for the job.]]> + + + + + + + + + It is the responsibility of the RecordReader to respect + record boundaries while processing the logical split to present a + record-oriented view to the individual task.

+ + @param split the {@link InputSplit} + @param job the job that this split belongs to + @return a {@link RecordReader}]]> +
+
+ + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
  1. Validate the input-specification of the job.
  2. Split-up the input file(s) into logical {@link InputSplit}s, each of which is then assigned to an individual {@link Mapper}.
  3. Provide the {@link RecordReader} implementation to be used to glean input records from the logical InputSplit for processing by the {@link Mapper}.
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Clearly, logical splits based on input-size is insufficient for many + applications since record boundaries are to respected. In such cases, the + application has to also implement a {@link RecordReader} on whom lies the + responsibilty to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see JobClient + @see FileInputFormat]]> + + + + + + + + + + InputSplit. + + @return the number of bytes in the input split. + @throws IOException]]> + + + + + + InputSplit is + located as an array of Strings. + @throws IOException]]> + + + + InputSplit represents the data to be processed by an + individual {@link Mapper}. + +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + jobid doesn't correspond to any known job. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient is the primary interface for the user-job to interact + with the {@link JobTracker}. + + JobClient provides facilities to submit jobs, track their + progress, access component-tasks' reports/logs, get the Map-Reduce cluster + status information etc. + +

The job submission process involves: +

    +
  1. Checking the input and output specifications of the job.
  2. Computing the {@link InputSplit}s for the job.
  3. Setup the requisite accounting information for the {@link DistributedCache} of the job, if necessary.
  4. Copying the job's jar and configuration to the map-reduce system directory on the distributed file-system.
  5. Submitting the job to the JobTracker and optionally monitoring its status.

+ + Normally the user creates the application, describes various facets of the + job via {@link JobConf} and then uses the JobClient to submit + the job and monitor its progress. + +

Here is an example on how to use JobClient:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     job.setInputPath(new Path("in"));
+     job.setOutputPath(new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+
+     // Submit the job, then poll for progress until the job is complete
+     JobClient.runJob(job);
+ 

+ +

Job Control

+ +

At times clients would chain map-reduce jobs to accomplish complex tasks + which cannot be done via a single map-reduce job. This is fairly easy since + the output of the job, typically, goes to distributed file-system and that + can be used as the input for the next job.

+ +

However, this also means that the onus on ensuring jobs are complete + (success/failure) lies squarely on the clients. In such situations the + various job-control options are: +

    +
  1. {@link #runJob(JobConf)} : submits the job and returns only after the job has completed.
  2. {@link #submitJob(JobConf)} : only submits the job, then poll the returned handle to the {@link RunningJob} to query status and make scheduling decisions.
  3. {@link JobConf#setJobEndNotificationURI(String)} : set up a notification on job-completion, thus avoiding polling.

+ + @see JobConf + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if framework should keep the intermediate files + for failed tasks, false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Note: +

+ @param dir the {@link Path} of the output directory for the map-reduce job.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the outputs of the maps are to be compressed, + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This comparator should be provided if the equivalence rules for keys + for sorting the intermediates are different from those for grouping keys + before each call to + {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

+ +

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + in a single call to the reduce function if K1 and K2 compare as equal.

+ +

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + how keys are sorted, this can be used in conjunction to simulate + secondary sort on values.

+ +

Note: This is not a guarantee of the reduce sort being + stable in any sense. (In any case, with the order of available + map-outputs to the reduce being non-deterministic, it wouldn't make + that much sense.)

+ + @param theClass the comparator class to be used for grouping keys. + It should implement RawComparator. + @see #setOutputKeyComparatorClass(Class)]]> +
+
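+ A sketch of wiring the two comparators together for the value-level secondary sort described above; CompositeKeyComparator and NaturalKeyGroupingComparator are hypothetical stand-ins for user-written RawComparator implementations:
+     JobConf job = new JobConf(MyJob.class);
+     // Sort map-outputs on the full composite key...
+     job.setOutputKeyComparatorClass(CompositeKeyComparator.class);
+     // ...but group the values passed to a single reduce() call by the natural key only.
+     job.setOutputValueGroupingComparator(NaturalKeyGroupingComparator.class);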
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. Typically the combiner is same as the + the {@link Reducer} for the job i.e. {@link #getReducerClass()}. + + @return the user-defined combiner class used to combine map-outputs.]]> + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. + +

The combiner is a task-level aggregation operation which, in some cases, + helps to cut down the amount of data transferred from the {@link Mapper} to + the {@link Reducer}, leading to better performance.

+ +

Typically the combiner is the same as the Reducer for the + job i.e. {@link #setReducerClass(Class)}.

+ + @param theClass the user-defined combiner class used to combine + map-outputs.]]> +
+
+ + + + + + + + + + + true. + + @return true if speculative execution be used for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on, else false.]]> + + + + + true. + + @return true if speculative execution be + used for this job for map tasks, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for map tasks, + else false.]]> + + + + + true. + + @return true if speculative execution be used + for reduce tasks for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for reduce tasks, + else false.]]> + + + + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + Note: This is only a hint to the framework. The actual + number of spawned map tasks depends on the number of {@link InputSplit}s + generated by the job's {@link InputFormat#getSplits(JobConf, int)}. + + A custom {@link InputFormat} is typically used to accurately control + the number of map tasks for the job.

+ +

How many maps?

+ +

The number of maps is usually driven by the total size of the inputs + i.e. total number of blocks of the input files.

+ +

The right level of parallelism for maps seems to be around 10-100 maps + per-node, although it has been set up to 300 or so for very cpu-light map + tasks. Task setup takes a while, so it is best if the maps take at least a + minute to execute.

+ +

The default behavior of file-based {@link InputFormat}s is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of input files. However, the {@link FileSystem} blocksize of the + input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Thus, if you expect 10TB of input data and have a blocksize of 128MB, + you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is + used to set it even higher.

+ + @param n the number of map tasks for this job. + @see InputFormat#getSplits(JobConf, int) + @see FileInputFormat + @see FileSystem#getDefaultBlockSize() + @see FileStatus#getBlockSize()]]> +
+
+ + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + How many reduces? + +

The right number of reduces seems to be 0.95 or + 1.75 multiplied by (<no. of nodes> * + + mapred.tasktracker.reduce.tasks.maximum). +

+ +

With 0.95 all of the reduces can launch immediately and + start transferring map outputs as the maps finish. With 1.75 + the faster nodes will finish their first round of reduces and launch a + second wave of reduces doing a much better job of load balancing.

+ +

Increasing the number of reduces increases the framework overhead, but + increases load balancing and lowers the cost of failures.

+ +

The scaling factors above are slightly less than whole numbers to + reserve a few reduce slots in the framework for speculative-tasks, failures + etc.

+ +

Reducer NONE

+ +

It is legal to set the number of reduce-tasks to zero.

+ +

In this case the output of the map-tasks goes directly to the distributed + file-system, to the path set by + {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the + framework doesn't sort the map-outputs before writing them out to HDFS.

+ + @param n the number of reduce tasks for this job.]]> +
+
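+ A sketch of the 0.95 rule of thumb above, using the cluster size reported by {@link JobClient#getClusterStatus()} and the configured per-node reduce slots; the JobClient call may throw IOException, which is not shown:
+     JobConf job = new JobConf(MyJob.class);
+     ClusterStatus cluster = new JobClient(job).getClusterStatus();
+     int reduceSlotsPerNode = job.getInt("mapred.tasktracker.reduce.tasks.maximum", 2);
+     job.setNumReduceTasks((int) (0.95 * cluster.getTaskTrackers() * reduceSlotsPerNode));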
+ + + mapred.map.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per map task.]]> + + + + + + + + + + + mapred.reduce.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per reduce task.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + noFailures, the + tasktracker is blacklisted for this job. + + @param noFailures maximum no. of failures of a given job per tasktracker.]]> + + + + + blacklisted for this job. + + @return the maximum no. of failures of a given job per tasktracker.]]> + + + + + failed. + + Defaults to zero, i.e. any failed map-task results in + the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + failed. + + Defaults to zero, i.e. any failed reduce-task results + in the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The debug script can aid debugging of failed map tasks. The script is + given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setMapDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param mDbgScript the script name]]> +
+
+ + + + + + + + + The debug script can aid debugging of failed reduce tasks. The script + is given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script file needs to be symlinked

+ +

Here is an example on how to submit a script +

+ job.setReduceDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param rDbgScript the script name]]> +
+
+ + + + + + + + null if it hasn't + been set. + @see #setJobEndNotificationURI(String)]]> + + + + + + The uri can contain 2 special parameters: $jobId and + $jobStatus. Those, if present, are replaced by the job's + identifier and completion-status respectively.

+ +

This is typically used by application-writers to implement chaining of + Map-Reduce jobs in an asynchronous manner.

+ + @param uri the job end notification uri + @see JobStatus + @see Job Completion and Chaining]]> +
+
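+ For example, a workflow engine could be notified with a URI such as the following; the host is a placeholder, and $jobId and $jobStatus are substituted by the framework:
+     job.setJobEndNotificationURI(
+         "http://workflow.example.com/notify?jobid=$jobId&status=$jobStatus");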
+ + + + When a job starts, a shared directory is created at location + + ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ . + This directory is exposed to the users through + job.local.dir . + So, the tasks can use this space + as scratch space and share files among them.

+ This value is available as System property also. + + @return The localized job specific shared directory]]> +
+
+ + JobConf is the primary interface for a user to describe a + map-reduce job to the Hadoop framework for execution. The framework tries to + faithfully execute the job as-is described by JobConf, however: +
    +
  1. Some configuration parameters might have been marked as final by administrators and hence cannot be altered.
  2. While some job parameters are straightforward to set (e.g. {@link #setNumReduceTasks(int)}), other parameters interact subtly with the rest of the framework and/or the job-configuration and are relatively more complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).

+ +

JobConf typically specifies the {@link Mapper}, combiner + (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and + {@link OutputFormat} implementations to be used etc. + +

Optionally JobConf is used to specify other advanced facets + of the job such as Comparators to be used, files to be put in + the {@link DistributedCache}, whether or not intermediate and/or job outputs + are to be compressed (and how), debuggability via user-provided scripts + ({@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}) + for doing post-processing on task logs, task's stdout, stderr, syslog, etc.

+ +

Here is an example on how to configure a job via JobConf:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     FileInputFormat.setInputPaths(job, new Path("in"));
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+     
+     job.setInputFormat(SequenceFileInputFormat.class);
+     job.setOutputFormat(SequenceFileOutputFormat.class);
+ 

+ + @see JobClient + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + any job + run on the jobtracker started at 200707121733, we would use : +
 
+ JobID.getTaskIDsPattern("200707121733", null);
+ 
+ which will return : +
 "job_200707121733_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @return a regex pattern matching JobIDs]]> +
+
+ + + An example JobID is : + job_200707121733_0003 , which represents the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse JobID strings, but rather + use appropriate constructors or {@link #forName(String)} method. + + @see TaskID + @see TaskAttemptID + @see JobTracker#getNewJobId() + @see JobTracker#getStartTime()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -archives + -files inputjar args]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + zero. + + @param conf configuration for the JobTracker. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + io.file.buffer.size specified in the given + Configuration. + @param in input stream + @param conf configuration + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Output pairs need not be of the same types as input pairs. A given + input pair may map to zero or many output pairs. Output pairs are + collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the input key. + @param value the input value. + @param output collects mapped keys and values. + @param reporter facility to report progress.]]> +
+ + + Maps are the individual tasks which transform input records into a + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link JobConf} for the + job via the {@link JobConfigurable#configure(JobConf)} and initialize + themselves. Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

The framework then calls + {@link #map(Object, Object, OutputCollector, Reporter)} + for each key/value pair in the InputSplit for that task.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the grouping by specifying + a Comparator via + {@link JobConf#setOutputKeyComparatorClass(Class)}.

+ +

The grouped Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

The intermediate, grouped outputs are always stored in + {@link SequenceFile}s. Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the JobConf.

+ +

If the job has + zero + reduces then the output of the Mapper is directly written + to the {@link FileSystem} without grouping by keys.

+ +

Example:

+

+     public class MyMapper<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Mapper<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+       
+       private String mapTaskId;
+       private String inputFile;
+       private int noRecords = 0;
+       
+       public void configure(JobConf job) {
+         mapTaskId = job.get("mapred.task.id");
+         inputFile = job.get("mapred.input.file");
+       }
+       
+       public void map(K key, V val,
+                       OutputCollector<K, V> output, Reporter reporter)
+       throws IOException {
+         // Process the <key, value> pair (assume this takes a while)
+         // ...
+         // ...
+         
+         // Let the framework know that we are alive, and kicking!
+         // reporter.progress();
+         
+         // Process some more
+         // ...
+         // ...
+         
+         // Increment the no. of <key, value> pairs processed
+         ++noRecords;
+
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+        
+         // Every 100 records update application-level status
+         if ((noRecords%100) == 0) {
+           reporter.setStatus(mapTaskId + " processed " + noRecords + 
+                              " from input-file: " + inputFile); 
+         }
+         
+         // Output the result
+         output.collect(key, val);
+       }
+     }
+ 

+ +

Applications may write a custom {@link MapRunnable} to exert greater + control on map processing e.g. multi-threaded Mappers etc.

+ + @see JobConf + @see InputFormat + @see Partitioner + @see Reducer + @see MapReduceBase + @see MapRunnable + @see SequenceFile]]> +
+
+ + + + + + + + + + + + + + + + + + + + + Provides default no-op implementations for a few methods, most non-trivial + applications need to override some of them.

]]> +
+
+ + + + + + + + + + + <key, value> pairs. + +

Mapping of input records to output records is complete when this method + returns.

+ + @param input the {@link RecordReader} to read the input records. + @param output the {@link OutputCollector} to collect the output records. + @param reporter {@link Reporter} to report progress, status-updates etc. + @throws IOException]]> +
+
+ + Custom implementations of MapRunnable can exert greater + control on map processing e.g. multi-threaded, asynchronous mappers etc.

+ + @see Mapper]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly + equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} + to construct RecordReader's for MultiFileSplit's. + @see MultiFileSplit]]> +
+
+ + + + + + + + + + + + + + + + + th Path]]> + + + + + + + + + + + th Path]]> + + + + + + + + + + + + + + + + + + + + + + + MultiFileSplit can be used to implement {@link RecordReader}'s, with + reading one record per file. + @see FileSplit + @see MultiFileInputFormat]]> + + + + + + + + + + + + + + + <key, value> pairs output by {@link Mapper}s + and {@link Reducer}s. + +

OutputCollector is the generalization of the facility + provided by the Map-Reduce framework to collect data output by either the + Mapper or the Reducer i.e. intermediate outputs + or the output of the job.

]]> +
+
+ + + + + + + + + + + + + + + + + + + This is to validate the output specification for the job when it is + a job is submitted. Typically checks that it does not already exist, + throwing an exception when it already exists, so that output is not + overwritten.

+ + @param ignored + @param job job configuration. + @throws IOException when output should not be attempted]]> +
+
+ + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    +
  1. + Validate the output-specification of the job. For e.g. check that the + output directory doesn't already exist. +
  2. + Provide the {@link RecordWriter} implementation to be used to write out + the output files of the job. Output files are stored in a + {@link FileSystem}. +
  3. +
+ + @see RecordWriter + @see JobConf]]> +
+
+ + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be partitioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.

+ + @see Reducer]]> +
+
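+ A sketch of a custom {@link Partitioner} that routes records by the first tab-separated token of the key, so that related keys meet in the same reduce; the class name and key layout are illustrative:
+     import org.apache.hadoop.io.Text;
+     import org.apache.hadoop.mapred.JobConf;
+     import org.apache.hadoop.mapred.Partitioner;
+
+     public class FirstTokenPartitioner implements Partitioner<Text, Text> {
+       public void configure(JobConf job) { }
+
+       public int getPartition(Text key, Text value, int numPartitions) {
+         String firstToken = key.toString().split("\t", 2)[0];
+         // Mask the sign bit so the partition number is always non-negative.
+         return (firstToken.hashCode() & Integer.MAX_VALUE) % numPartitions;
+       }
+     }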
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0 to 1.0. + @throws IOException]]> + + + + RecordReader reads <key, value> pairs from an + {@link InputSplit}. + +

RecordReader, typically, converts the byte-oriented view of + the input, provided by the InputSplit, and presents a + record-oriented view for the {@link Mapper} & {@link Reducer} tasks for + processing. It thus assumes the responsibility of processing record + boundaries and presenting the tasks with keys and values.

+ + @see InputSplit + @see InputFormat]]> +
+
+ + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param reporter facility to report progress. + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + Reduces values for a given key. + +

The framework calls this method for each + <key, (list of values)> pair in the grouped inputs. + Output values must be of the same type as input values. Input keys must + not be altered. The framework will reuse the key and value objects + that are passed into the reduce, therefore the application should clone + the objects they want to keep a copy of. In many cases, all values are + combined into zero or one value. +

+ +

Output pairs are collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the key. + @param values the list of values to reduce. + @param output to collect keys and combined values. + @param reporter facility to report progress.]]> +
+ + + The number of Reducers for the job is set by the user via + {@link JobConf#setNumReduceTasks(int)}. Reducer implementations + can access the {@link JobConf} for the job via the + {@link JobConfigurable#configure(JobConf)} method and initialize themselves. + Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

Reducer has 3 primary phases:

+
    +
  1. Shuffle
     The Reducer receives as input the grouped output of a {@link Mapper}. In this phase the framework, for each Reducer, fetches the relevant partition of the output of all the Mappers, via HTTP.
  2. Sort
     The framework groups Reducer inputs by keys (since different Mappers may have output the same key) in this stage.
     The shuffle and sort phases occur simultaneously i.e. while outputs are being fetched they are merged.
     SecondarySort
     If equivalence rules for keys while grouping the intermediates are different from those for grouping keys before reduction, then one may specify a Comparator via {@link JobConf#setOutputValueGroupingComparator(Class)}. Since {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to control how intermediate keys are grouped, these can be used in conjunction to simulate secondary sort on values.
     For example, say that you want to find duplicate web pages and tag them all with the url of the "best" known example. You would set up the job like:
       • Map Input Key: url
       • Map Input Value: document
       • Map Output Key: document checksum, url pagerank
       • Map Output Value: url
       • Partitioner: by checksum
       • OutputKeyComparator: by checksum and then decreasing pagerank
       • OutputValueGroupingComparator: by checksum
  3. Reduce
     In this phase the {@link #reduce(Object, Iterator, OutputCollector, Reporter)} method is called for each <key, (list of values)> pair in the grouped inputs.
     The output of the reduce task is typically written to the {@link FileSystem} via {@link OutputCollector#collect(Object, Object)}.
+ +

The output of the Reducer is not re-sorted.

+ +

Example:

+

+     public class MyReducer<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Reducer<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+        
+       private String reduceTaskId;
+       private int noKeys = 0;
+       
+       public void configure(JobConf job) {
+         reduceTaskId = job.get("mapred.task.id");
+       }
+       
+       public void reduce(K key, Iterator<V> values,
+                          OutputCollector<K, V> output, 
+                          Reporter reporter)
+       throws IOException {
+       
+         // Process
+         int noValues = 0;
+         while (values.hasNext()) {
+           V value = values.next();
+           
+           // Increment the no. of values for this key
+           ++noValues;
+           
+           // Process the <key, value> pair (assume this takes a while)
+           // ...
+           // ...
+           
+           // Let the framework know that we are alive, and kicking!
+           if ((noValues%10) == 0) {
+             reporter.progress();
+           }
+         
+           // Process some more
+           // ...
+           // ...
+           
+           // Output the <key, value> 
+           output.collect(key, value);
+         }
+         
+         // Increment the no. of <key, list of values> pairs processed
+         ++noKeys;
+         
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+         
+         // Every 100 keys update application-level status
+         if ((noKeys%100) == 0) {
+           reporter.setStatus(reduceTaskId + " processed " + noKeys);
+         }
+       }
+     }
+ 

+ + @see Mapper + @see Partitioner + @see Reporter + @see MapReduceBase]]> +
+
+ + + + + + + + + + + + + + + Enum. + @param amount A non-negative amount by which the counter is to + be incremented.]]> + + + + + + + + + + + + + + InputSplit that the map is reading from. + @throws UnsupportedOperationException if called outside a mapper]]> + + + + + + + + + {@link Mapper} and {@link Reducer} can use the Reporter + provided to report progress or just indicate that they are alive. In + scenarios where the application takes an insignificant amount of time to + process individual key/value pairs, this is crucial since the framework + might assume that the task has timed-out and kill that task. + +

Applications can also update {@link Counters} via the provided + Reporter .

+ + @see Progressable + @see Counters]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RunningJob is the user-interface to query for details on a + running Map-Reduce job. + +

Clients can get hold of RunningJob via the {@link JobClient} + and then query the running-job for details such as name, configuration, + progress etc.

+ + @see JobClient]]> +
+
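+ A sketch of submitting a job and polling the returned handle, along the lines of the progress accessors described above; checked exceptions are omitted for brevity and the accessor names follow the RunningJob interface:
+     JobClient client = new JobClient(job);
+     RunningJob running = client.submitJob(job);
+     while (!running.isComplete()) {
+       System.out.println("map " + (int) (running.mapProgress() * 100) + "%, " +
+                          "reduce " + (int) (running.reduceProgress() * 100) + "%");
+       Thread.sleep(5000);
+     }
+     System.out.println(running.isSuccessful() ? "job succeeded" : "job failed");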
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This allows the user to specify the key class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + This allows the user to specify the value class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output value class.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f. The filtering criteria is + MD5(key) % f == 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + f using + the criteria record# % f == 0. + For example, if the frequency is 10, one out of 10 records is returned.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + @param name The name of the server + @param port The port to use on the server + @param findPort whether the server should start at the given port and + increment by 1 until it finds a free port.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points to the log directory + "/static/" -> points to common static files (src/webapps/static) + "/" -> the jsp server code from (src/webapps/)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all task attempt IDs + of any jobtracker, in any job, of the first + map task, we would use : +
 
+ TaskAttemptID.getTaskAttemptIDsPattern(null, null, true, 1, null);
+ 
+ which will return : +
 "attempt_[^_]*_[0-9]*_m_000001_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @param attemptId the task attempt number, or null + @return a regex pattern matching TaskAttemptIDs]]> +
+
+ + + An example TaskAttemptID is : + attempt_200707121733_0003_m_000005_0 , which represents the + zeroth task attempt for the fifth map task in the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse TaskAttemptID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the first map task + of any jobtracker, of any job, we would use : +

 
+ TaskID.getTaskIDsPattern(null, null, true, 1);
+ 
+ which will return : +
 "task_[^_]*_[0-9]*_m_000001*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @return a regex pattern matching TaskIDs]]> +
+ + + + An example TaskID is : + task_200707121733_0003_m_000005 , which represents the + fifth map task in the third job running at the jobtracker + started at 200707121733. +

+ Applications should never construct or parse TaskID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.log.dir.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the Job was added.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ([,]*) + func ::= tbl(,"") + class ::= @see java.lang.Class#forName(java.lang.String) + path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) + } + Reads expression from the mapred.join.expr property and + user-supplied join types from mapred.join.define.<ident> + types. Paths supplied to tbl are given as input paths to the + InputFormat class listed. + @see #compose(java.lang.String, java.lang.Class, java.lang.String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ,

) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + mapred.join.define.<ident> to a classname. In the expression + mapred.join.expr, the identifier will be assumed to be a + ComposableRecordReader. + mapred.join.keycomparator can be a classname used to compare keys + in the join. + @see JoinRecordReader + @see MultiFilterRecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + capacity children to position + id in the parent reader. + The id of a root CompositeRecordReader is -1 by convention, but relying + on this is not recommended.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + override(S1,S2,S3) will prefer values + from S3 over S2, and values from S2 over S1 for all keys + emitted from all sources.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [,,...,]]]> + + + + + + + out. + TupleWritable format: + {@code + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Map implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured to use this MapRunnable class (using + the JobConf.setMapRunnerClass method) and + the number of threads the thread-pool can use with the + mapred.map.multithreadedrunner.threads property; its default + value is 10 threads. +
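+ A hedged configuration sketch of the paragraph above. The runner class and the
+ property name are taken from the text; the driver class and the pool size of 20
+ are illustrative assumptions.
+
+     import org.apache.hadoop.mapred.JobConf;
+     import org.apache.hadoop.mapred.lib.MultithreadedMapRunner;
+
+     JobConf job = new JobConf(MyDriver.class);   // hypothetical driver class
+     job.setMapRunnerClass(MultithreadedMapRunner.class);
+     // Raise the pool size from the default of 10 threads.
+     job.setInt("mapred.map.multithreadedrunner.threads", 20);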

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pairs. Uses + {@link StringTokenizer} to break text into tokens.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + generateKeyValPairs(Object key, Object value); public void + configure(JobConfjob); } + + The package also provides a base class, ValueAggregatorBaseDescriptor, + implementing the above interface. The user can extend the base class and + implement generateKeyValPairs accordingly. + + The primary work of generateKeyValPairs is to emit one or more key/value + pairs based on the input key/value pair. The key in an output key/value pair + encode two pieces of information: aggregation type and aggregation id. The + value will be aggregated onto the aggregation id according the aggregation + type. + + This class offers a function to generate a map/reduce job using Aggregate + framework. The function takes the following parameters: input directory spec + input format (text or sequence file) output directory a file specifying the + user plugin class]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When constructing the instance, if the factory property + contextName.class exists, + its value is taken to be the name of the class to instantiate. Otherwise, + the default is to create an instance of + org.apache.hadoop.metrics.spi.NullContext, which is a + dummy "no-op" context which will cause all metric data to be discarded. + + @param contextName the name of the context + @return the named MetricsContext]]> + + + + + + + + + + + + + + When the instance is constructed, this method checks if the file + hadoop-metrics.properties exists on the class path. If it + exists, it must be in the format defined by java.util.Properties, and all + the properties in the file are set as attributes on the newly created + ContextFactory instance. 
+ + @return the singleton ContextFactory instance]]> + + + + getFactory() method.]]> + + + + + + + + + + + + + + + + + + + startMonitoring() again after calling + this. + @see #close()]]> + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A record name identifies the kind of data to be reported. For example, a + program reporting statistics relating to the disks on a computer might use + a record name "diskStats".

+ + A record has zero or more tags. A tag has a name and a value. To + continue the example, the "diskStats" record might use a tag named + "diskName" to identify a particular disk. Sometimes it is useful to have + more than one tag, so there might also be a "diskType" with value "ide" or + "scsi" or whatever.

+ + A record also has zero or more metrics. These are the named + values that are to be reported to the metrics system. In the "diskStats" + example, possible metric names would be "diskPercentFull", "diskPercentBusy", + "kbReadPerSecond", etc.

+ + The general procedure for using a MetricsRecord is to fill in its tag and + metric values, and then call update() to pass the record to the + client library. + Metric data is not immediately sent to the metrics system + each time that update() is called. + An internal table is maintained, identified by the record name. This + table has columns + corresponding to the tag and the metric names, and rows + corresponding to each unique set of tag values. An update + either modifies an existing row in the table, or adds a new row with a set of + tag values that are different from all the other rows. Note that if there + are no tags, then there can be at most one row in the table.

+ + Once a row is added to the table, its data will be sent to the metrics system + on every timer period, whether or not it has been updated since the previous + timer period. If this is inappropriate, for example if metrics were being + reported by some transient object in an application, the remove() + method can be used to remove the row and thus stop the data from being + sent.
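+ A short sketch of the procedure described above, reusing the "diskStats" names
+ from the example. MetricsUtil.getContext is an assumed convenience helper;
+ createRecord, setTag, setMetric, update and remove are the operations this
+ documentation describes.
+
+     // classes from org.apache.hadoop.metrics
+     MetricsContext context = MetricsUtil.getContext("diskStats");  // assumed helper
+     MetricsRecord record = context.createRecord("diskStats");
+     record.setTag("diskName", "/dev/sda1");
+     record.setMetric("diskPercentFull", 42);
+     record.update();   // queued; sent to the metrics system on the next timer period
+     // ... later, when this disk should no longer be reported:
+     record.remove();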

+ + Note that the update() method is atomic. This means that it is + safe for different threads to be updating the same metric. More precisely, + it is OK for different threads to call update() on MetricsRecord instances + with the same set of tag names and tag values. Different threads should + not use the same MetricsRecord instance at the same time.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MetricsContext.registerUpdater().]]> + + + + + + + + + + + + + + + + + + + + + + + + + fileName attribute, + if specified. Otherwise the data will be written to standard + output.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is configured by setting ContextFactory attributes which in turn + are usually configured through a properties file. All the attributes are + prefixed by the contextName. For example, the properties file might contain: +

+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contextName.tableName. The returned map consists of + those attributes with the contextName and tableName stripped off.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class implements the internal table of metric data, and the timer + on which data is to be sent to the metrics system. Subclasses must + override the abstract emitRecord method in order to transmit + the data.

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + update + and remove().]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hostname or hostname:port. If + the specs string is null, defaults to localhost:defaultPort. + + @return a list of InetSocketAddress objects.]]> + + + + + + + + + + + + + + + + + + + ,name=" + Where the and are the supplied parameters + + @param serviceName + @param nameName + @param theMbean - the MBean to register + @return the named used to register the MBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.rpc.socket.factory.class.<ClassName>. When no + such parameter exists then fall back on the default socket factory as + configured by hadoop.rpc.socket.factory.class.default. If + this default socket factory is not configured, then fall back on the JVM + default socket factory. + + @param conf the configuration + @param clazz the class (usually a {@link VersionedProtocol}) + @return a socket factory]]> + + + + + + hadoop.rpc.socket.factory.default + + @param conf the configuration + @return the default socket factory as specified in the configuration or + the JVM default socket factory if the configuration does not + contain a default socket factory property.]]> + + + + + + + + + + + + + : + ://:/]]> + + + + + + + + : + ://:/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketInputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getInputStream()} is returned. In the latter + case, the timeout argument is ignored and the timeout set with + {@link Socket#setSoTimeout(int)} applies for reads.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see #getInputStream(Socket, long) + + @param socket + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + +
+ + From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketOutputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getOutputStream()} is returned. In the latter + case, the timeout argument is ignored and the write will wait until + data is available.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see #getOutputStream(Socket, long) + + @param socket + @return OutputStream for writing to the socket. + @throws IOException]]> +
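+ A hedged sketch of the pattern these methods describe. getInputStream and
+ getOutputStream are the documented calls; the socket set-up, the address and
+ the 30 second timeout are illustrative assumptions.
+
+     // NetUtils is org.apache.hadoop.net.NetUtils
+     Socket socket = new Socket("datanode.example.com", 50010);  // hypothetical peer
+     // Prefer the NetUtils wrappers over socket.getInputStream()/getOutputStream()
+     // so the timeout is honoured whether or not the socket has a channel.
+     InputStream in = NetUtils.getInputStream(socket, 30000);
+     OutputStream out = NetUtils.getOutputStream(socket, 30000);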
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+
+ + + + + + + + + + + + + + + + + + + + + node + + @param node + a node + @return true if node is already in the tree; false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scope + if scope starts with ~, choose one from the all nodes except for the + ones in scope; otherwise, choose one from scope + @param scope range of nodes from which a node will be choosen + @return the choosen node]]> + + + + + + + scope but not in excludedNodes + if scope starts with ~, return the number of nodes that are not + in scope and excludedNodes; + @param scope a path string that may start with ~ + @param excludedNodes a list of nodes + @return number of available nodes]]> + + + + + + + + + + + + reader + It linearly scans the array, if a local node is found, swap it with + the first element of the array. + If a local rack node is found, swap it with the first element following + the local node. + If neither local node or local rack node is found, put a random replica + location at postion 0. + It leaves the rest nodes untouched.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
+
+ + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + +
+ + Create a new output stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
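+ A brief sketch of the constructors documented above; the channel set-up, the
+ address and the timeout values are assumptions.
+
+     // SocketInputStream/SocketOutputStream are in org.apache.hadoop.net
+     SocketChannel channel =
+         SocketChannel.open(new InetSocketAddress("localhost", 50010));  // hypothetical
+     Socket socket = channel.socket();          // a socket with an associated channel
+     InputStream in = new SocketInputStream(socket, 30000);    // 30s read timeout
+     OutputStream out = new SocketOutputStream(socket, 30000); // 30s write timeout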
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = getCount(). + @param newCapacity The new capacity in bytes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index idx = startVector(...); + while (!idx.done()) { + .... // read element of a vector + idx.incr(); + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This task takes the given record definition files and compiles them into + java or c++ + files. It is then up to the user to compile the generated files. + +

The task requires the file or the nested fileset element to be + specified. Optional attributes are language (set the output + language, default is "java"), + destdir (name of the destination directory for generated java/c++ + code, default is ".") and failonerror (specifies error handling + behavior; default is true). +

Usage

+
+ <recordcc
+       destdir="${basedir}/gensrc"
+       language="java">
+   <fileset include="**\/*.jr" />
+ </recordcc>
+ 
]]> +
+
+ +
+ + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi as a comma separated string in + conf as a property attr + + The String starts with the user name followed by the default group names, + and other group names. + + @param conf configuration + @param attr property name + @param ugi a UnixUserGroupInformation]]> + + + + + + + + conf + + The object is expected to store with the property name attr + as a comma separated string that starts + with the user name followed by group names. + If the property name is not defined, return null. + It's assumed that there is only one UGI per user. If this user already + has a UGI in the ugi map, return the ugi in the map. + Otherwise, construct a UGI from the configuration, store it in the + ugi map and return it. + + @param conf configuration + @param attr property name + @return a UnixUGI + @throws LoginException if the stored string is ill-formatted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool supports archiving and anaylzing (sort/grep) of log-files. + It takes as input + a) Input uri which will serve uris of the logs to be archived. + b) Output directory (not mandatory). + b) Directory on dfs to archive the logs. + c) The sort/grep patterns for analyzing the files and separator for boundaries. + Usage: + Logalyzer -archive -archiveDir -analysis -logs -grep -sort -separator +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GenericOptionsParser to parse only the generic Hadoop + arguments. + + The array of string arguments other than the generic arguments can be + obtained by {@link #getRemainingArgs()}. + + @param conf the Configuration to modify. + @param args command-line arguments.]]> + + + + + GenericOptionsParser to parse given options as well + as generic Hadoop options. + + The resulting CommandLine object can be obtained by + {@link #getCommandLine()}. + + @param conf the configuration to modify + @param options options built by the caller + @param args User-specified arguments]]> + + + + + Strings containing the un-parsed arguments + or empty array if commandLine was not defined.]]> + + + + + CommandLine object + to process the parsed arguments. + + Note: If the object is created with + {@link #GenericOptionsParser(Configuration, String[])}, then returned + object will only contain parsed generic options. + + @return CommandLine representing list of arguments + parsed against Options descriptor.]]> + + + + + + + + + + GenericOptionsParser is a utility to parse command line + arguments generic to the Hadoop framework. + + GenericOptionsParser recognizes several standarad command + line arguments, enabling applications to easily specify a namenode, a + jobtracker, additional configuration resources etc. + +

Generic Options

+ +

The supported generic options are:

+

+     -conf <configuration file>     specify a configuration file
+     -D <property=value>            use value for given property
+     -fs <local|namenode:port>      specify a namenode
+     -jt <local|jobtracker:port>    specify a job tracker
+     -files <comma separated list of files>    specify comma separated
+                            files to be copied to the map reduce cluster
+     -libjars <comma separated list of jars>   specify comma separated
+                            jar files to include in the classpath.
+     -archives <comma separated list of archives>    specify comma
+             separated archives to be unarchived on the compute machines.
+
+ 

+ +

The general command line syntax is:

+

+ bin/hadoop command [genericOptions] [commandOptions]
+ 

+ +

Generic command line arguments might modify + Configuration objects given to constructors.

+ +

The functionality is implemented using Commons CLI.

+ +

Examples:

+

+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+ 
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+     
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+     
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+ 
+ $ bin/hadoop jar -libjars testlib.jar 
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+ 

+ + @see Tool + @see ToolRunner]]> +
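+ A minimal sketch of using the parser directly. The constructor and
+ getRemainingArgs are documented above; the surrounding main() is assumed.
+
+     public static void main(String[] args) throws Exception {
+       Configuration conf = new Configuration();
+       GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+       // conf now reflects -D, -fs, -jt, -conf, -files, -libjars and -archives.
+       String[] toolArgs = parser.getRemainingArgs();
+       System.out.println("application-specific args: " + toolArgs.length);
+     }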
+
+ + + + + + + + + Class<T>) of the + argument of type T. + @param The type of the argument + @param t the object to get it class + @return Class<T>]]> + + + + + + + List<T> to a an array of + T[]. + @param c the Class object of the items in the list + @param list the list to convert]]> + + + + + + List<T> to a an array of + T[]. + @param list the list to convert + @throws ArrayIndexOutOfBoundsException if the list is empty. + Use {@link #toArray(Class, List)} if the list may be empty.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-hadoop is loaded, + else false]]> + + + + + + true if native hadoop libraries, if present, can be + used for this job; false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { pq.top().change(); pq.adjustTop(); } + instead of
+  { o = pq.pop(); o.change(); pq.push(o); }
+ 
]]> +
+
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clients and/or applications can use the provided Progressable + to explicitly report progress to the Hadoop framework. This is especially + important for operations which take an insignificant amount of time since, + in-lieu of the reported progress, the framework has to assume that an error + has occured and time-out the operation.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Class is to be obtained + @return the correctly typed Class of the given object.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop Pipes + or Hadoop Streaming. + + It also checks to ensure that we are running on a *nix platform else + (e.g. in Cygwin/Windows) it returns null. + @param job job configuration + @return a String[] with the ulimit command arguments or + null if we are running on a non *nix platform or + if the limit is unspecified.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell interface. + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell can be used to run unix commands like du or + df. It also offers facilities to gate commands by + time-intervals.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ShellCommandExecutorshould be used in cases where the output + of the command needs no explicit parsing and where the command, working + directory and the environment remains unchanged. The output of the command + is stored as-is and is expected to be small.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ArrayList of string values]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the char to be escaped + @return an escaped string]]> + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the escaped char + @return an unescaped string]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tool, is the standard for any Map-Reduce tool/application. + The tool/application should delegate the handling of + + standard command-line options to {@link ToolRunner#run(Tool, String[])} + and only handle its custom arguments.

+ +

Here is how a typical Tool is implemented:

+

+     public class MyApp extends Configured implements Tool {
+     
+       public int run(String[] args) throws Exception {
+         // Configuration processed by ToolRunner
+         Configuration conf = getConf();
+         
+         // Create a JobConf using the processed conf
+         JobConf job = new JobConf(conf, MyApp.class);
+         
+         // Process custom command-line options
+         Path in = new Path(args[1]);
+         Path out = new Path(args[2]);
+         
+         // Specify various job-specific parameters     
+         job.setJobName("my-app");
+         job.setInputPath(in);
+         job.setOutputPath(out);
+         job.setMapperClass(MyApp.MyMapper.class);
+         job.setReducerClass(MyApp.MyReducer.class);
+
+         // Submit the job, then poll for progress until the job is complete
+         JobClient.runJob(job);
+         return 0;
+       }
+       
+       public static void main(String[] args) throws Exception {
+         // Let ToolRunner handle generic command-line options 
+         int res = ToolRunner.run(new Configuration(), new MyApp(), args);
+         
+         System.exit(res);
+       }
+     }
+ 

+ + @see GenericOptionsParser + @see ToolRunner]]> +
+
+ + + + + + + + + + + + Tool by {@link Tool#run(String[])}, after + parsing with the given generic arguments. Uses the given + Configuration, or builds one if null. + + Sets the Tool's configuration with the possibly modified + version of the conf. + + @param conf Configuration for the Tool. + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + Tool with its Configuration. + + Equivalent to run(tool.getConf(), tool, args). + + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + + + ToolRunner can be used to run classes implementing + Tool interface. It works in conjunction with + {@link GenericOptionsParser} to parse the + + generic hadoop command line arguments and modifies the + Configuration of the Tool. The + application-specific options are passed along without being modified. +

+ + @see Tool + @see GenericOptionsParser]]> +
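+ For completeness, a sketch of the run(tool, args) overload described above,
+ reusing the MyApp tool from the earlier example; setConf comes from the
+ Configured base class, and the surrounding code is assumed.
+
+     MyApp app = new MyApp();
+     app.setConf(new Configuration());
+     int exitCode = ToolRunner.run(app, args);  // equivalent to run(app.getConf(), app, args)
+     System.exit(exitCode);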
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
diff --git a/lib/jdiff/hadoop_0.19.0.xml b/lib/jdiff/hadoop_0.19.0.xml new file mode 100644 index 00000000000..557ac3cc598 --- /dev/null +++ b/lib/jdiff/hadoop_0.19.0.xml @@ -0,0 +1,43972 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + final. + + @param name resource to be added, the classpath is examined for a file + with that name.]]> + + + + + + final. + + @param url url of the resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param file file-path of resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param in InputStream to deserialize the object from.]]> + + + + + + + + + + + name property, null if + no such property exists. + + Values are processed for variable expansion + before being returned. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + name property, without doing + variable expansion. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + + value of the name property. + + @param name property name. + @param value property value.]]> + + + + + + + name property. If no such property + exists, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value, or defaultValue if the property + doesn't exist.]]> + + + + + + + name property as an int. + + If no such property exists, or if the specified value is not a valid + int, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as an int, + or defaultValue.]]> + + + + + + + name property to an int. + + @param name property name. + @param value int value of the property.]]> + + + + + + + name property as a long. + If no such property is specified, or if the specified value is not a valid + long, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a long, + or defaultValue.]]> + + + + + + + name property to a long. + + @param name property name. + @param value long value of the property.]]> + + + + + + + name property as a float. + If no such property is specified, or if the specified value is not a valid + float, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a float, + or defaultValue.]]> + + + + + + + name property as a boolean. + If no such property is specified, or if the specified value is not a valid + boolean, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a boolean, + or defaultValue.]]> + + + + + + + name property to a boolean. + + @param name property name. + @param value boolean value of the property.]]> + + + + + + + + + + + + + name property as + a collection of Strings. + If no such property is specified then empty collection is returned. +

+ This is an optimized version of {@link #getStrings(String)} + + @param name property name. + @return property value as a collection of Strings.]]> + + + + + + name property as + an array of Strings. + If no such property is specified then null is returned. + + @param name property name. + @return property value as an array of Strings, + or null.]]> + + + + + + + name property as + an array of Strings. + If no such property is specified then default value is returned. + + @param name property name. + @param defaultValue The default value + @return property value as an array of Strings, + or default value.]]> + + + + + + + name property as + as comma delimited values. + + @param name property name. + @param values The values]]> + + + + + + + + + + + + + + name property + as an array of Class. + The value of the property specifies a list of comma separated class names. + If no such property is specified, then defaultValue is + returned. + + @param name the property name. + @param defaultValue default value. + @return property value as a Class[], + or defaultValue.]]> + + + + + + + name property as a Class. + If no such property is specified, then defaultValue is + returned. + + @param name the class name. + @param defaultValue default value. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property as a Class + implementing the interface specified by xface. + + If no such property is specified, then defaultValue is + returned. + + An exception is thrown if the returned class does not implement the named + interface. + + @param name the class name. + @param defaultValue default value. + @param xface the interface implemented by the named class. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property to the name of a + theClass implementing the given interface xface. + + An exception is thrown if theClass does not implement the + interface xface. + + @param name property name. + @param theClass property value. + @param xface the interface implemented by the named class.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + + + + + name. + + @param name configuration resource name. + @return an input stream attached to the resource.]]> + + + + + + name. + + @param name configuration resource name. + @return a reader attached to the resource.]]> + + + + + + + + + + + + + + + String + key-value pairs in the configuration. + + @return an iterator over the entries.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + true to set quiet-mode on, false + to turn it off.]]> + + + + + + + + + + + + + + + + + + + Resources + +

Configurations are specified by resources. A resource contains a set of + name/value pairs as XML data. Each resource is named by either a + String or by a {@link Path}. If named by a String, + then the classpath is examined for a file with that name. If named by a + Path, then the local filesystem is examined directly, without + referring to the classpath. + +

Unless explicitly turned off, Hadoop by default specifies two + resources, loaded in-order from the classpath:

    +
  1. hadoop-default.xml: Read-only defaults for hadoop.
  2. hadoop-site.xml: Site-specific configuration for a given hadoop installation.
+ Applications may add additional resources, which are loaded + subsequent to these resources in the order they are added. + +
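+ A small sketch of adding such resources programmatically; the resource names
+ are hypothetical, and addResource is the method documented earlier in this file.
+
+     Configuration conf = new Configuration();
+     conf.addResource("my-app-default.xml");                    // looked up on the classpath
+     conf.addResource(new Path("/etc/myapp/my-app-site.xml"));  // explicit path (org.apache.hadoop.fs.Path)
+     // Later resources override earlier, non-final values.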

Final Parameters

+ +

Configuration parameters may be declared final. + Once a resource declares a value final, no subsequently-loaded + resource can alter that value. + For example, one might define a final parameter with: +

+  <property>
+    <name>dfs.client.buffer.dir</name>
+    <value>/tmp/hadoop/dfs/client</value>
+    <final>true</final>
+  </property>
+ + Administrators typically define parameters as final in + hadoop-site.xml for values that user applications may not alter. + +

Variable Expansion

+ +

Value strings are first processed for variable expansion. The + available properties are:

    +
  1. Other properties defined in this Configuration; and, if a name is undefined here,
  2. Properties in {@link System#getProperties()}.
+ +

For example, if a configuration resource contains the following property + definitions: +

+  <property>
+    <name>basedir</name>
+    <value>/user/${user.name}</value>
+  </property>
+  
+  <property>
+    <name>tempdir</name>
+    <value>${basedir}/tmp</value>
+  </property>
+ + When conf.get("tempdir") is called, then ${basedir} + will be resolved to another property in this Configuration, while + ${user.name} would then ordinarily be resolved to the value + of the System property with that name.]]> +
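+ A small sketch of the expansion just described; the property names reuse the
+ example above, and the programmatic set() calls merely stand in for the XML
+ resource.
+
+     Configuration conf = new Configuration();
+     conf.set("basedir", "/user/${user.name}");
+     conf.set("tempdir", "${basedir}/tmp");
+     // get() expands ${basedir} from this Configuration and then
+     // ${user.name} from the System properties.
+     System.out.println(conf.get("tempdir"));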
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DistributedCache is a facility provided by the Map-Reduce + framework to cache files (text, archives, jars etc.) needed by applications. +

+ +

Applications specify the files, via urls (hdfs:// or http://) to be cached + via the {@link org.apache.hadoop.mapred.JobConf}. + The DistributedCache assumes that the + files specified via hdfs:// urls are already present on the + {@link FileSystem} at the path specified by the url.

+ +

The framework will copy the necessary files on to the slave node before + any tasks for the job are executed on that node. Its efficiency stems from + the fact that the files are only copied once per job and the ability to + cache archives which are un-archived on the slaves.

+ +

DistributedCache can be used to distribute simple, read-only + data/text files and/or more complex types such as archives, jars etc. + Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. + Jars may be optionally added to the classpath of the tasks, a rudimentary + software distribution mechanism. Files have execution permissions. + Optionally users can also direct it to symlink the distributed cache file(s) + into the working directory of the task.

+ +

DistributedCache tracks modification timestamps of the cache + files. Clearly the cache files should not be modified by the application + or externally while the job is executing.

+ +

Here is an illustrative example of how to use the + DistributedCache:

+

+     // Setting up the cache for the application
+     
+     1. Copy the requisite files to the FileSystem:
+     
+     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
+     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
+     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+     
+     2. Setup the application's JobConf:
+     
+     JobConf job = new JobConf();
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
+                                   job);
+     DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+     
+     3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+     or {@link org.apache.hadoop.mapred.Reducer}:
+     
+     public static class MapClass extends MapReduceBase  
+     implements Mapper<K, V, K, V> {
+     
+       private Path[] localArchives;
+       private Path[] localFiles;
+       
+       public void configure(JobConf job) {
+         // Get the cached archives/files
+         localArchives = DistributedCache.getLocalCacheArchives(job);
+         localFiles = DistributedCache.getLocalCacheFiles(job);
+       }
+       
+       public void map(K key, V value, 
+                       OutputCollector<K, V> output, Reporter reporter) 
+       throws IOException {
+         // Use data from the cached archives/files here
+         // ...
+         // ...
+         output.collect(k, v);
+       }
+     }
+     
+ 

+ + @see org.apache.hadoop.mapred.JobConf + @see org.apache.hadoop.mapred.JobClient]]> +
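+ As a small follow-up to the example above: the symlinking behaviour mentioned
+ earlier is enabled per job. createSymlink is assumed to be the call that turns
+ it on; the "#name" fragment syntax is the one shown in step 2 of the example.
+
+     JobConf job = new JobConf();
+     // Ask the framework to symlink cached files into the task's working directory.
+     DistributedCache.createSymlink(job);
+     // The URI fragment (after '#') names the symlink created in the working directory.
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), job);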
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BufferedFSInputStream + with the specified buffer size, + and saves its argument, the input stream + in, for later use. An internal + buffer array of length size + is created and stored in buf. + + @param in the underlying input stream. + @param size the buffer size. + @exception IllegalArgumentException if size <= 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + setReplication of FileSystem + @param src file name + @param replication new replication + @throws IOException + @return true if successful; + false if file does not exist or is a directory]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ' + @deprecated Consider using {@link GenericOptionsParser} instead.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fs.scheme.class whose value names the FileSystem class. + The entire URI is passed to the FileSystem instance's initialize method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Return all the files that match filePattern and are not checksum + files. Results are sorted by their names. + +

+ A filename pattern is composed of regular characters and + special pattern matching characters, which are: + +

+
+
+

+

? +
Matches any single character. + +

+

* +
Matches zero or more characters. + +

+

[abc] +
Matches a single character from character set + {a,b,c}. + +

+

[a-b] +
Matches a single character from the character range + {a...b}. Note that character a must be + lexicographically less than or equal to character b. + +

+

[^a] +
Matches a single character that is not from character set or range + {a}. Note that the ^ character must occur + immediately to the right of the opening bracket. + +

+

\c +
Removes (escapes) any special meaning of character c. + +

+

{ab,cd} +
Matches a string from the string set {ab, cd} + +

+

{ab,c{de,fh}} +
Matches a string from the string set {ab, cde, cfh} + +
+
+
+ + @param pathPattern a regular expression specifying a pth pattern + + @return an array of paths that match the path pattern + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All user code that may potentially use the Hadoop Distributed + File System should be written to use a FileSystem object. The + Hadoop DFS is a multi-machine system that appears as a single + disk. It's useful because of its fault tolerance and potentially + very large capacity. + +

+ The local implementation is {@link LocalFileSystem} and distributed + implementation is DistributedFileSystem.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FilterFileSystem contains + some other file system, which it uses as + its basic file system, possibly transforming + the data along the way or providing additional + functionality. The class FilterFileSystem + itself simply overrides all methods of + FileSystem with versions that + pass all requests to the contained file + system. Subclasses of FilterFileSystem + may further override some of these methods + and may also provide additional methods + and fields.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + buf at offset + and checksum into checksum. + The method is used for implementing read, therefore, it should be optimized + for sequential reading + @param pos chunkPos + @param buf desitination buffer + @param offset offset in buf at which to store data + @param len maximun number of bytes to read + @return number of bytes read]]> + + + + + + + + + + + + + + + + + -1 if the end of the + stream is reached. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + This method implements the general contract of the corresponding + {@link InputStream#read(byte[], int, int) read} method of + the {@link InputStream} class. As an additional + convenience, it attempts to read as many bytes as possible by repeatedly + invoking the read method of the underlying stream. This + iterated read continues until one of the following + conditions becomes true:

    + +
  • The specified number of bytes have been read, + +
  • The read method of the underlying stream returns + -1, indicating end-of-file. + +
If the first read on the underlying stream returns + -1 to indicate end-of-file then this method returns + -1. Otherwise this method returns the number of bytes + actually read. + + @param b destination buffer. + @param off offset at which to start storing bytes. + @param len maximum number of bytes to read. + @return the number of bytes read, or -1 if the end of + the stream has been reached. + @exception IOException if an I/O error occurs. + ChecksumException if any checksum error occurs]]> +
+ + + + + + + + + + + + + + + + + + n bytes of data from the + input stream. + +

This method may skip more bytes than are remaining in the backing + file. This produces no exception and the number of bytes skipped + may include some number of bytes that were beyond the EOF of the + backing file. Attempting to read from the stream after skipping past + the end will result in -1 indicating the end of the file. + +

If n is negative, no bytes are skipped. + + @param n the number of bytes to be skipped. + @return the actual number of bytes skipped. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to skip to is corrupted]]> + + + + + + + This method may seek past the end of the file. + This produces no exception and an attempt to read from + the stream will result in -1 indicating the end of the file. + + @param pos the postion to seek to. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to seek to is corrupted]]> + + + + + + + + + + len bytes from + stm + + @param stm an input stream + @param buf destiniation buffer + @param offset offset at which to store data + @param len number of bytes to read + @return actual number of bytes read + @throws IOException if there is any IO error]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + len bytes from the specified byte array + starting at offset off and generate a checksum for + each data chunk. + +

This method stores bytes from the given array into this + stream's buffer before it gets checksumed. The buffer gets checksumed + and flushed to the underlying output stream when all data + in a checksum chunk are in the buffer. If the buffer is empty and + requested length is at least as large as the size of next checksum chunk + size, this method will checksum and write the chunk directly + to the underlying output stream. Thus it avoids uneccessary data copy. + + @param b the data. + @param off the start offset in the data. + @param len the number of bytes to write. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if and only if pathname + should be included]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + trash feature. Files are moved to a user's trash + directory, a subdirectory of their home directory named ".Trash". Files are + initially moved to a current sub-directory of the trash directory. + Within that sub-directory their original path is preserved. Periodically + one may checkpoint the current trash and remove older checkpoints. (This + design permits trash management without enumeration of the full trash + content, without date support in the filesystem, and without clock + synchronization.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} backed by an FTP client provided by Apache Commons Net. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is a tool for migrating data from an older to a newer version + of an S3 filesystem. +

+

+ All files in the filesystem are migrated by re-writing the block metadata + - no datafiles are touched. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + Extracts AWS credentials from the filesystem URI or configuration. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A block-based {@link FileSystem} backed by + Amazon S3. +

+ @see NativeS3FileSystem]]> +
+
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If f is a file, this method will make a single call to S3. + If f is a directory, this method will make a maximum of + (n / 1000) + 2 calls to S3, where n is the total number of + files and directories contained directly in f. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} for reading and writing files stored on + Amazon S3. + Unlike {@link org.apache.hadoop.fs.s3.S3FileSystem} this implementation + stores files on S3 in their + native form so they can be read by other S3 tools. +

+ @see org.apache.hadoop.fs.s3.S3FileSystem]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + @param name The name of the server + @param port The port to use on the server + @param findPort whether the server should start at the given port and + increment by 1 until it finds a free port. + @param conf Configuration]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points to the log directory + "/static/" -> points to common static files (src/webapps/static) + "/" -> the jsp server code from (src/webapps/)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nth value.]]> + + + + + + + + + + + + + + + + + + + + + nth value in the file.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + public class IntArrayWritable extends ArrayWritable { + public IntArrayWritable() { + super(IntWritable.class); + } + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a ByteWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataInputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using DataInput methods ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataOutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using DataOutput methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
]]> +
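+ A minimal round-trip sketch (not part of the original javadoc) combining DataOutputBuffer with DataInputBuffer so the same backing arrays are reused across iterations; the record values are illustrative only:
+ import java.io.IOException;
+ import org.apache.hadoop.io.DataInputBuffer;
+ import org.apache.hadoop.io.DataOutputBuffer;
+ 
+ public class BufferRoundTrip {
+   public static void main(String[] args) throws IOException {
+     DataOutputBuffer out = new DataOutputBuffer();
+     DataInputBuffer in = new DataInputBuffer();
+     for (int i = 0; i < 3; i++) {
+       out.reset();                          // reuse the same backing array
+       out.writeInt(i);
+       out.writeUTF("record-" + i);
+       in.reset(out.getData(), out.getLength());
+       int id = in.readInt();                // read back what was just written
+       String name = in.readUTF();
+       System.out.println(id + " " + name);
+     }
+   }
+ }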
+
+ + + + + + + + + + + + + + + + + + + + + + + + the class of the item + @param conf the configuration to store + @param item the object to be stored + @param keyName the name of the key to use + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param items the objects to be stored + @param keyName the name of the key to use + @throws IndexOutOfBoundsException if the items array is empty + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + DefaultStringifier offers convenience methods to store/load objects to/from + the configuration. + + @param the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a DoubleWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a FloatWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When two sequence files, which have same Key type but different Value + types, are mapped out to reduce, multiple Value types is not allowed. + In this case, this class can help you wrap instances with different types. +

+ +

+ Compared with ObjectWritable, this class is much more effective, + because ObjectWritable will append the class declaration as a String + into the output file in every Key-Value pair. +

+ +

+ Generic Writable implements the {@link Configurable} interface, so that it will be + configured by the framework. The configuration is passed to the wrapped objects + implementing the {@link Configurable} interface before deserialization. +

+ + how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines + the classes that will be wrapped in GenericObject in the application. + Attention: the classes defined in the getTypes() method must + implement the Writable interface. +

+ + The code looks like this: +
+ public class GenericObject extends GenericWritable {
+ 
+   private static Class[] CLASSES = {
+               ClassType1.class, 
+               ClassType2.class,
+               ClassType3.class,
+               };
+
+   protected Class[] getTypes() {
+       return CLASSES;
+   }
+
+ }
+ 
+ + @since Nov 8, 2006]]> +
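+ As a rough usage sketch (assumed, not from the original javadoc), IntWritable and Text stand in here for the application's own value classes registered in getTypes():
+ import org.apache.hadoop.io.GenericWritable;
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.io.Writable;
+ 
+ public class ValueWrapper extends GenericWritable {
+   // IntWritable and Text stand in for the application's own value classes.
+   private static Class[] CLASSES = { IntWritable.class, Text.class };
+ 
+   protected Class[] getTypes() {
+     return CLASSES;
+   }
+ 
+   public static void main(String[] args) {
+     ValueWrapper wrapper = new ValueWrapper();
+     wrapper.set(new Text("hello"));        // wrap one of the registered types
+     Writable unwrapped = wrapper.get();    // after readFields(), recover the wrapped instance
+     System.out.println(unwrapped);
+   }
+ }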
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new InputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using InputStream methods ...
+ }
+ 
+ @see DataInputBuffer + @see DataOutput]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a IntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + closes the input and output streams + at the end. + @param in InputStrem to read from + @param out OutputStream to write to + @param conf the Configuration object]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ignore any {@link IOException} or + null pointers. Must only be used for cleanup in exception handlers. + @param log the log to record problems to at debug level. Can be null. + @param closeables the objects to close]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a LongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A map is a directory containing two files, the data file, + containing all keys and values in the map, and a smaller index + file, containing a fraction of the keys. The fraction is determined by + {@link Writer#getIndexInterval()}. + +

The index file is read entirely into memory. Thus key implementations + should try to keep themselves small. + +
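+ The in-order writes and indexed lookups described here might look roughly like the following sketch; the directory name, the key/value types and the use of the local default FileSystem are illustrative assumptions:
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.MapFile;
+ import org.apache.hadoop.io.Text;
+ 
+ public class MapFileSketch {
+   public static void main(String[] args) throws Exception {
+     Configuration conf = new Configuration();
+     FileSystem fs = FileSystem.get(conf);
+     String dir = "example.map";                      // hypothetical map directory
+ 
+     // Keys must be appended in sorted order.
+     MapFile.Writer writer =
+         new MapFile.Writer(conf, fs, dir, IntWritable.class, Text.class);
+     for (int i = 0; i < 100; i++) {
+       writer.append(new IntWritable(i), new Text("value-" + i));
+     }
+     writer.close();
+ 
+     // Random access goes through the in-memory index over the data file.
+     MapFile.Reader reader = new MapFile.Reader(fs, dir, conf);
+     Text value = new Text();
+     reader.get(new IntWritable(42), value);
+     reader.close();
+   }
+ }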

Map files are created by adding entries in-order. To maintain a large + database, perform updates by copying the previous version of a database and + merging in a sorted change list, to create a new version of the database in + a new file. Sorting large change lists can be done with {@link + SequenceFile.Sorter}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key and + val. Returns true if such a pair exists and false when at + the end of the map]]> + + + + + + + + + + + + + + + + key or if it does not exist, at the first entry + after the named key. + +- * @param key - key that we're trying to find +- * @param val - data value if key is found +- * @return - the key that was the closest match or null if eof.]]> + + + + + + + + + key does not exist, return + the first entry that falls just before the key. Otherwise, + return the record that sorts just after. + @return - the key that was the closest match or null if eof.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is an MD5Hash whose digest contains the + same values.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new OutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ OutputBuffer buffer = new OutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using OutputStream methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
+ @see DataOutputBuffer + @see InputBuffer]]> +
+
+ + + + + + + + + + + + + + + A {@link Comparator} that operates directly on byte representations of + objects. +

+ @param + @see DeserializerComparator]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SequenceFiles are flat files consisting of binary key/value + pairs. + +

SequenceFile provides {@link Writer}, {@link Reader} and + {@link Sorter} classes for writing, reading and sorting respectively.

+ + There are three SequenceFile Writers based on the + {@link CompressionType} used to compress key/value pairs: +
    +
+   1. Writer : Uncompressed records.
+   2. RecordCompressWriter : Record-compressed files, only compress values.
+   3. BlockCompressWriter : Block-compressed files, both keys & values are collected in 'blocks' separately and compressed. The size of the 'block' is configurable.
+ +

The actual compression algorithm used to compress key and/or values can be + specified by using the appropriate {@link CompressionCodec}.

+ +

The recommended way is to use the static createWriter methods + provided by the SequenceFile to choose the preferred format.

+ +

The {@link Reader} acts as the bridge and can read any of the above + SequenceFile formats.

+ +

SequenceFile Formats

+ +

Essentially there are 3 different formats for SequenceFiles + depending on the CompressionType specified. All of them share a + common header described below. + +

+
    +
+   • version - 3 bytes of magic header SEQ, followed by 1 byte of actual version number (e.g. SEQ4 or SEQ6)
+   • keyClassName - key class
+   • valueClassName - value class
+   • compression - A boolean which specifies if compression is turned on for keys/values in this file.
+   • blockCompression - A boolean which specifies if block-compression is turned on for keys/values in this file.
+   • compression codec - CompressionCodec class which is used for compression of keys and/or values (if compression is enabled).
+   • metadata - {@link Metadata} for this file.
+   • sync - A sync marker to denote end of the header.
+ +
Uncompressed SequenceFile Format
+
    +
+   • Header
+   • Record
+       • Record length
+       • Key length
+       • Key
+       • Value
+   • A sync-marker every few 100 bytes or so.
+ +
Record-Compressed SequenceFile Format
+
    +
+   • Header
+   • Record
+       • Record length
+       • Key length
+       • Key
+       • Compressed Value
+   • A sync-marker every few 100 bytes or so.
+ +
Block-Compressed SequenceFile Format
+
    +
+   • Header
+   • Record Block
+       • Compressed key-lengths block-size
+       • Compressed key-lengths block
+       • Compressed keys block-size
+       • Compressed keys block
+       • Compressed value-lengths block-size
+       • Compressed value-lengths block
+       • Compressed values block-size
+       • Compressed values block
+   • A sync-marker every few 100 bytes or so.
+ +

The compressed blocks of key lengths and value lengths consist of the + actual lengths of individual keys/values encoded in ZeroCompressedInteger + format.

+ + @see CompressionCodec]]> +
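+ A minimal end-to-end sketch (illustrative, assuming the createWriter overload that takes a CompressionType) that writes a block-compressed SequenceFile and reads it back with the format-agnostic Reader; the path and the key/value types are arbitrary:
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.SequenceFile;
+ import org.apache.hadoop.io.Text;
+ 
+ public class SequenceFileSketch {
+   public static void main(String[] args) throws Exception {
+     Configuration conf = new Configuration();
+     FileSystem fs = FileSystem.get(conf);
+     Path file = new Path("example.seq");             // hypothetical path
+ 
+     // Writer variant selected via the CompressionType argument (here: block compression).
+     SequenceFile.Writer writer = SequenceFile.createWriter(
+         fs, conf, file, IntWritable.class, Text.class,
+         SequenceFile.CompressionType.BLOCK);
+     for (int i = 0; i < 10; i++) {
+       writer.append(new IntWritable(i), new Text("record-" + i));
+     }
+     writer.close();
+ 
+     // The Reader handles all three file formats transparently.
+     SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+     IntWritable key = new IntWritable();
+     Text value = new Text();
+     while (reader.next(key, value)) {
+       System.out.println(key + "\t" + value);
+     }
+     reader.close();
+   }
+ }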
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key, skipping its + value. True if another entry exists, and false at end of file.]]> + + + + + + + + key and + val. Returns true if such a pair exists and false when at + end of file]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The position passed must be a position returned by {@link + SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary + position, use {@link SequenceFile.Reader#sync(long)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SegmentDescriptor + @param segments the list of SegmentDescriptors + @param tmpDir the directory to write temporary files into + @return RawKeyValueIterator + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For best performance, applications should make sure that the {@link + Writable#readFields(DataInput)} implementation of their keys is + very efficient. In particular, it should avoid allocating memory.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This always returns a synchronized position. In other words, + immediately after calling {@link SequenceFile.Reader#seek(long)} with a position + returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However + the key may be earlier in the file than key last written when this + method was called (e.g., with block-compression, it may be the first key + in the block that was being written when this method was called).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key. Returns + true if such a key exists and false when at the end of the set.]]> + + + + + + + key. + Returns key, or null if no match exists.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + position. Note that this + method avoids using the converter or doing String instatiation + @return the Unicode scalar value at position or -1 + if the position is invalid or points to a + trailing byte]]> + + + + + + + + + + what in the backing + buffer, starting as position start. The starting + position is measured in bytes and the return value is in + terms of byte position in the buffer. The backing buffer is + not converted to a string for this operation. 
+ @return byte position of the first occurence of the search + string in the UTF-8 buffer or -1 if not found]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a Text with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException.]]> + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException. + @return ByteBuffer: bytes stores at ByteBuffer.array() + and length is ByteBuffer.limit()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In + addition, it provides methods for string traversal without converting the + byte array to a string.

Also includes utilities for + serializing/deserialing a string, coding/decoding a string, checking if a + byte array contains valid UTF8 code, calculating the length of an encoded + string.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a UTF8 with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + Also includes utilities for efficiently reading and writing UTF-8. + + @deprecated replaced by Text]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This is useful when a class may evolve, so that instances written by the + old version of the class may still be processed by the new version. To + handle this situation, {@link #readFields(DataInput)} + implementations should catch {@link VersionMismatchException}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VIntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VLongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + out. + + @param out DataOuput to serialize this object into. + @throws IOException]]> + + + + + + + in. + +

For efficiency, implementations should attempt to re-use storage in the + existing object where possible.

+ + @param in DataInput to deserialize this object from. + @throws IOException]]> +
+ + + Any key or value type in the Hadoop Map-Reduce + framework implements this interface.

+ +

Implementations typically implement a static read(DataInput) + method which constructs a new instance, calls {@link #readFields(DataInput)} + and returns the instance.

+ +

Example:

+

+     public class MyWritable implements Writable {
+       // Some data     
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public static MyWritable read(DataInput in) throws IOException {
+         MyWritable w = new MyWritable();
+         w.readFields(in);
+         return w;
+       }
+     }
+ 

]]> +
+ + + + + + + + WritableComparables can be compared to each other, typically + via Comparators. Any type which is to be used as a + key in the Hadoop Map-Reduce framework should implement this + interface.

+ +

Example:

+

+     public class MyWritableComparable implements WritableComparable {
+       // Some data
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public int compareTo(MyWritableComparable w) {
+         int thisValue = this.counter;
+         int thatValue = w.counter;
+         return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+       }
+     }
+ 

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The default implementation reads the data into two {@link + WritableComparable}s (using {@link + Writable#readFields(DataInput)}, then calls {@link + #compare(WritableComparable,WritableComparable)}.]]> + + + + + + + The default implementation uses the natural ordering, calling {@link + Comparable#compareTo(Object)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This base implemenation uses the natural ordering. To define alternate + orderings, override {@link #compare(WritableComparable,WritableComparable)}. + +

One may optimize compare-intensive operations by overriding + {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are + provided to assist in optimized implementations of this method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum type + @param in DataInput to read from + @param enumType Class type of Enum + @return Enum represented by String read from DataInput + @throws IOException]]> + + + + + + + + + + + + + + + + len number of bytes in input streamin + @param in input stream + @param len number of bytes to skip + @throws IOException when skipped less number of bytes]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CompressionCodec for which to get the + Compressor + @return Compressor for the given + CompressionCodec from the pool or a new one]]> + + + + + + CompressionCodec for which to get the + Decompressor + @return Decompressor for the given + CompressionCodec the pool or a new one]]> + + + + + + Compressor to be returned to the pool]]> + + + + + + Decompressor to be returned to the + pool]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Implementations are assumed to be buffered. This permits clients to + reposition the underlying input stream then call {@link #resetState()}, + without having to also synchronize client buffers.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + true if a preset dictionary is needed for decompression. 
+ @return true if a preset dictionary is needed for decompression]]> + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-lzo library is loaded & initialized; + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair. + http://www.oberhumer.com/opensource/lzo/]]> + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair compatible with lzop. + http://www.lzop.org/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FIXME: This array should be in a private or package private location, + since it could be modified by malicious code. +

]]> +
+ + + + This interface is public for historical purposes. You should have no need to + use it. +

]]> +
+ + + + + + + + + + Although BZip2 headers are marked with the magic "Bz" this + constructor expects the next byte in the stream to be the first one after + the magic. Thus callers have to skip the first two bytes. Otherwise this + constructor will throw an exception. +

+ + @throws IOException + if the stream content is malformed or an I/O error occurs. + @throws NullPointerException + if in == null]]> +
+
+ + + + + + + + + + + + + + + The decompression requires large amounts of memory. Thus you should call the + {@link #close() close()} method as soon as possible, to force + CBZip2InputStream to release the allocated memory. See + {@link CBZip2OutputStream CBZip2OutputStream} for information about memory + usage. +

+ +

+ CBZip2InputStream reads bytes from the compressed source stream via + the single byte {@link java.io.InputStream#read() read()} method exclusively. + Thus you should consider using a buffered source stream. +

+ +

+ Instances of this class are not threadsafe. +

]]> +
+
+ + + + + + + + CBZip2OutputStream with a blocksize of 900k. + +

+ Attention: The caller is responsible for writing the two BZip2 magic + bytes "BZ" to the specified stream prior to calling this + constructor. +

+ + @param out + the destination stream. + + @throws IOException + if an I/O error occurs in the specified stream. + @throws NullPointerException + if out == null.]]> +
+
+ + + + CBZip2OutputStream with specified blocksize. + +

+ Attention: The caller is responsible for writing the two BZip2 magic + bytes "BZ" to the specified stream prior to calling this + constructor. +

+ + + @param out + the destination stream. + @param blockSize + the blockSize as 100k units. + + @throws IOException + if an I/O error occurs in the specified stream. + @throws IllegalArgumentException + if (blockSize < 1) || (blockSize > 9). + @throws NullPointerException + if out == null. + + @see #MIN_BLOCKSIZE + @see #MAX_BLOCKSIZE]]> +
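+ A hedged sketch of the calling convention described above, assuming the classes live in org.apache.hadoop.io.compress.bzip2 and using a hypothetical file name; the caller writes, and later skips, the two magic bytes itself:
+ import java.io.FileInputStream;
+ import java.io.FileOutputStream;
+ import java.io.InputStream;
+ import java.io.OutputStream;
+ import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
+ import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;
+ 
+ public class BZip2MagicSketch {
+   public static void main(String[] args) throws Exception {
+     // Writing: emit the "BZ" magic before constructing the stream.
+     OutputStream raw = new FileOutputStream("data.bz2");       // hypothetical file name
+     raw.write('B');
+     raw.write('Z');
+     CBZip2OutputStream bzOut = new CBZip2OutputStream(raw, 9); // blocksize in 100k units
+     bzOut.write("some data".getBytes("UTF-8"));
+     bzOut.close();
+ 
+     // Reading: skip the two magic bytes before constructing the stream.
+     InputStream in = new FileInputStream("data.bz2");
+     in.read();   // 'B'
+     in.read();   // 'Z'
+     CBZip2InputStream bzIn = new CBZip2InputStream(in);
+     int b;
+     while ((b = bzIn.read()) != -1) {
+       System.out.print((char) b);
+     }
+     bzIn.close();
+   }
+ }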
+
+ + + + + + + + + + + + + inputLength this method returns MAX_BLOCKSIZE + always. + + @param inputLength + The length of the data which will be compressed by + CBZip2OutputStream.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + == 1.]]> + + + + + == 9.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If you are ever unlucky/improbable enough to get a stack overflow whilst + sorting, increase the following constant and try again. In practice I + have never seen the stack go above 27 elems, so the following limit seems + very generous. +

]]> +
+
+ + + The compression requires large amounts of memory. Thus you should call the + {@link #close() close()} method as soon as possible, to force + CBZip2OutputStream to release the allocated memory. +

+ +

+ You can shrink the amount of allocated memory and maybe raise the compression + speed by choosing a lower blocksize, which in turn may cause a lower + compression ratio. You can avoid unnecessary memory allocation by avoiding + using a blocksize which is bigger than the size of the input. +

+ +

+ You can compute the memory usage for compressing by the following formula: +

+ +
+ <code>400k + (9 * blocksize)</code>.
+ 
+ +

+ To get the memory required for decompression by {@link CBZip2InputStream + CBZip2InputStream} use +

+ +
+ <code>65k + (5 * blocksize)</code>.
+ 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Memory usage by blocksize
+ Blocksize      Compression memory usage      Decompression memory usage
+ 100k           1300k                         565k
+ 200k           2200k                         1065k
+ 300k           3100k                         1565k
+ 400k           4000k                         2065k
+ 500k           4900k                         2565k
+ 600k           5800k                         3065k
+ 700k           6700k                         3565k
+ 800k           7600k                         4065k
+ 900k           8500k                         4565k
+ +

+ For decompression CBZip2InputStream allocates less memory if the + bzipped input is smaller than one block. +

+ +

+ Instances of this class are not threadsafe. +

+ +

+ TODO: Update to BZip2 1.0.1 +

]]> +
+
+ +
+ + + + + + + + + + + + + + + + + true if lzo compressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if lzo decompressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-zlib is loaded & initialized + and can be loaded for this job, else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keep trying a limited number of times, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying for a maximum time, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by the number of tries so far. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime multiplied by a random + number in the range [0, 2^(number of retries)) +

]]> +
+
+ + + + + + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + + + A retry policy for RemoteException + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + Try once, and fail by re-throwing the exception. + This corresponds to having no retry mechanism in place. +

]]> +
+
+ + + + Try once, and fail silently for void methods, or by + re-throwing the exception for non-void methods. +

]]> +
+
+ + + + Keep trying forever. +

]]> +
+
+ + + A collection of useful implementations of {@link RetryPolicy}. +

]]> +
+
+ + + + + + + + + + Determines whether the framework should retry a + method for the given exception, and the number + of retries that have been made for that operation + so far. +

+ @param e The exception that caused the method to fail. + @param retries The number of times the method has been retried. + @return true if the method should be retried, + false if the method should not be retried + but shouldn't fail with an exception (only for void methods). + @throws Exception The re-thrown exception e indicating + that the method failed and should not be retried further.]]> +
+
+ + + Specifies a policy for retrying method failures. + Implementations of this interface should be immutable. +

]]> +
+
+ + + + + + + + + + + + Create a proxy for an interface of an implementation class + using the same retry policy for each method in the interface. +

+ + @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param retryPolicy the policy for retrying method call failures + @return the retry proxy]]> +
+
+ + + + + + + Create a proxy for an interface of an implementation class + using a set of retry policies specified by method name. + If no retry policy is defined for a method then a default of + {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param methodNameToPolicyMap a map of method names to retry policies + @return the retry proxy]]> +
+
+ + + A factory for creating retry proxies. +

]]> +
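+ A minimal sketch of wiring these pieces together; Pinger is a hypothetical application interface, and the policy shown assumes the retryUpToMaximumCountWithFixedSleep(int, long, TimeUnit) factory described above:
+ import java.util.concurrent.TimeUnit;
+ import org.apache.hadoop.io.retry.RetryPolicies;
+ import org.apache.hadoop.io.retry.RetryPolicy;
+ import org.apache.hadoop.io.retry.RetryProxy;
+ 
+ public class RetrySketch {
+   // Hypothetical application interface whose calls should be retried.
+   public interface Pinger {
+     void ping() throws Exception;
+   }
+ 
+   public static Pinger wrap(Pinger real) {
+     // Retry up to 5 times, sleeping 2 seconds between attempts.
+     RetryPolicy policy =
+         RetryPolicies.retryUpToMaximumCountWithFixedSleep(5, 2, TimeUnit.SECONDS);
+     return (Pinger) RetryProxy.create(Pinger.class, real, policy);
+   }
+ }
+ Calls made through the returned proxy are re-invoked on failure according to the chosen policy, while the wrapped instance itself stays unaware of the retries.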
+
+ +
+ + + + + + + + Prepare the deserializer for reading.

]]> +
+
+ + + + + + Deserialize the next object from the underlying input stream. + If the object t is non-null then this deserializer + may set its internal state to the next object read from the input + stream. Otherwise, if the object t is null a new + deserialized object will be created. +

+ @return the deserialized object]]> +
+
+ + + + Close the underlying input stream and clear up any resources.

]]> +
+
+ + + Provides a facility for deserializing objects of type from an + {@link InputStream}. +

+ +

+ Deserializers are stateful, but must not buffer the input since + other producers may read from the input between calls to + {@link #deserialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + A {@link RawComparator} that uses a {@link Deserializer} to deserialize + the objects to be compared so that the standard {@link Comparator} can + be used to compare them. +

+

+ One may optimize compare-intensive operations by using a custom + implementation of {@link RawComparator} that operates directly + on byte representations. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + An experimental {@link Serialization} for Java {@link Serializable} classes. +

+ @see JavaSerializationComparator]]> +
+
+ + + + + + + + + + + + + A {@link RawComparator} that uses a {@link JavaSerialization} + {@link Deserializer} to deserialize objects that are then compared via + their {@link Comparable} interfaces. +

+ @param + @see JavaSerialization]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + Encapsulates a {@link Serializer}/{@link Deserializer} pair. +

+ @param ]]> +
+
+ + + + + + + Serializations are found by reading the io.serializations + property from conf, which is a comma-delimited list of + classnames. +

]]> +
+
+ + + + + + + + + + + + A factory for {@link Serialization}s. +

]]> +
+
+ + + + + + + + Prepare the serializer for writing.

]]> +
+
+ + + + + Serialize t to the underlying output stream.

]]> +
+
+ + + + Close the underlying output stream and clear up any resources.

]]> +
+
+ + + Provides a facility for serializing objects of type to an + {@link OutputStream}. +

+ +

+ Serializers are stateful, but must not buffer the output since + other producers may write to the output between calls to + {@link #serialize(Object)}. +

+ @param ]]> +
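+ A short sketch (illustrative, assuming the SerializationFactory described in this package) that looks up the Serializer registered for Text and streams one object into a reusable buffer:
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.io.DataOutputBuffer;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.io.serializer.SerializationFactory;
+ import org.apache.hadoop.io.serializer.Serializer;
+ 
+ public class SerializerSketch {
+   public static void main(String[] args) throws Exception {
+     Configuration conf = new Configuration();
+     SerializationFactory factory = new SerializationFactory(conf);
+ 
+     // Pick the Serializer registered for Text (the Writable serialization by default).
+     Serializer<Text> serializer = factory.getSerializer(Text.class);
+     DataOutputBuffer out = new DataOutputBuffer();
+     serializer.open(out);
+     serializer.serialize(new Text("hello"));
+     serializer.close();
+     System.out.println("serialized " + out.getLength() + " bytes");
+   }
+ }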
+
+ + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param, to the IPC server running at + address, returning the value. Throws exceptions if there are + network problems or if the remote code threw an exception.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unwraps any IOException. + + @param lookupTypes the desired exception class. + @return IOException, which is either the lookupClass exception or this.]]> + + + + + This unwraps any Throwable that has a constructor taking + a String as a parameter. + Otherwise it returns this. + + @return Throwable]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + protocol is a Java interface. All parameters and return types must + be one of: + +
+   • a primitive type, boolean, byte, char, short, int, long, float, double, or void; or
+   • a {@link String}; or
+   • a {@link Writable}; or
+   • an array of the above types
+ + All methods in the protocol should throw only IOException. No field data of + the protocol instance is transmitted.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + handlerCount determines + the number of handler threads that will be used to process calls.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #rpcQueueTime}.inc(time)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker, + as {@link JobTracker.State} + + @return the current state of the JobTracker.]]> + + + + + + + + + + + + ClusterStatus provides clients with information such as: +

    +
+   1. Size of the cluster.
+   2. Task capacity of the cluster.
+   3. The number of currently running map & reduce tasks.
+   4. State of the JobTracker.

+ +

Clients can query for the latest ClusterStatus, via + {@link JobClient#getClusterStatus()}.

+ + @see JobClient]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter can be of + any {@link Enum} type.

+ +

Counters are bunched into {@link Group}s, each comprising of + counters from a particular Enum class.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of counters, comprising of counters from a particular + counter {@link Enum} class. + +

Group handles localization of the class name and the + counter names.

]]> +
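+ A small sketch (not from the original javadoc) of how an application-defined counter Enum typically gets incremented from a map task via Reporter; the enum and the record logic are illustrative:
+ import java.io.IOException;
+ import org.apache.hadoop.io.LongWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.MapReduceBase;
+ import org.apache.hadoop.mapred.Mapper;
+ import org.apache.hadoop.mapred.OutputCollector;
+ import org.apache.hadoop.mapred.Reporter;
+ 
+ public class CountingMapper extends MapReduceBase
+     implements Mapper<LongWritable, Text, Text, LongWritable> {
+ 
+   // Application-defined counters are grouped under this enum's class name.
+   public enum RecordCounter { EMPTY_LINES, GOOD_LINES }
+ 
+   public void map(LongWritable key, Text value,
+                   OutputCollector<Text, LongWritable> output, Reporter reporter)
+       throws IOException {
+     if (value.toString().length() == 0) {
+       reporter.incrCounter(RecordCounter.EMPTY_LINES, 1);
+     } else {
+       reporter.incrCounter(RecordCounter.GOOD_LINES, 1);
+       output.collect(value, new LongWritable(1));
+     }
+   }
+ }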
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param fs the file system that the file is on + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides a generic implementation of + {@link #getSplits(JobConf, int)}. + Subclasses of FileInputFormat can also override the + {@link #isSplitable(FileSystem, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +

Note: The following is valid only if the {@link OutputCommitter} + is {@link FileOutputCommitter}. If OutputCommitter is not + a FileOutputCommitter, the task's temporary output + directory is same as {@link #getOutputPath(JobConf)} i.e. + ${mapred.output.dir}$

+ +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the attemptid, say + attempt_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any + side-files required in ${mapred.work.output.dir} during execution + of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the + framework will move them out similarly - thus she doesn't have to pick + unique paths per task-attempt.

+ +

Note: the value of ${mapred.work.output.dir} during + execution of a particular task-attempt is actually + ${mapred.output.dir}/_temporary/_{$taskid}, and this value is + set by the map-reduce framework. So, just create any side-files in the + path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce + task to take advantage of this feature.

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
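+ A hedged sketch of the side-file pattern described above, assuming the static FileOutputFormat.getWorkOutputPath(JobConf) helper referenced in the text; the side-file name is arbitrary:
+ import java.io.IOException;
+ import org.apache.hadoop.fs.FSDataOutputStream;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.mapred.FileOutputFormat;
+ import org.apache.hadoop.mapred.JobConf;
+ 
+ public class SideFileSketch {
+   // Called from within a task: writes a side-file under the task's work directory,
+   // which the framework promotes to ${mapred.output.dir} only if the attempt succeeds.
+   public static void writeSideFile(JobConf conf) throws IOException {
+     Path workDir = FileOutputFormat.getWorkOutputPath(conf);
+     Path sideFile = new Path(workDir, "side-data.txt");   // hypothetical file name
+     FileSystem fs = sideFile.getFileSystem(conf);
+     FSDataOutputStream out = fs.create(sideFile);
+     out.writeBytes("auxiliary output\n");
+     out.close();
+   }
+ }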
+
+ + + + + + + + + + + + + The generated name can be used to create custom files from within the + different tasks for the job, the names for different tasks will not collide + with each other.

+ +

The given name is postfixed with the task type, 'm' for maps, 'r' for + reduces and the task partition number. For example, given the name 'test' + running on the first map of the job, the generated name will be + 'test-m-00000'.

+ + @param conf the configuration for the job. + @param name the name to make unique. + @return a unique name across all tasks of the job.]]> +
+
+ + + + + The path can be used to create custom files from within the map and + reduce tasks. The path name will be unique for each task. The path parent + will be the job output directory.

+ +

This method uses the {@link #getUniqueName} method to make the file name + unique for the task.

+ + @param conf the configuration for the job. + @param name the name for the file. + @return a unique path across all tasks of the job.]]> +
+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For e.g. a split could + be <input-file-path, start, offset> tuple. + + @param job job configuration. + @param numSplits the desired number of splits, a hint. + @return an array of {@link InputSplit}s for the job.]]> + + + + + + + + + It is the responsibility of the RecordReader to respect + record boundaries while processing the logical split to present a + record-oriented view to the individual task.

+ + @param split the {@link InputSplit} + @param job the job that this split belongs to + @return a {@link RecordReader}]]> +
+
+ + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
+   1. Validate the input-specification of the job.
+   2. Split-up the input file(s) into logical {@link InputSplit}s, each of which is then assigned to an individual {@link Mapper}.
+   3. Provide the {@link RecordReader} implementation to be used to glean input records from the logical InputSplit for processing by the {@link Mapper}.
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Clearly, logical splits based on input-size are insufficient for many + applications since record boundaries are to be respected. In such cases, the + application has to also implement a {@link RecordReader} on whom lies the + responsibility to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see JobClient + @see FileInputFormat]]> +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + jobid doesn't correspond to any known job. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient is the primary interface for the user-job to interact + with the {@link JobTracker}. + + JobClient provides facilities to submit jobs, track their + progress, access component-tasks' reports/logs, get the Map-Reduce cluster + status information etc. + +

The job submission process involves: +

    +
+   1. Checking the input and output specifications of the job.
+   2. Computing the {@link InputSplit}s for the job.
+   3. Setting up the requisite accounting information for the {@link DistributedCache} of the job, if necessary.
+   4. Copying the job's jar and configuration to the map-reduce system directory on the distributed file-system.
+   5. Submitting the job to the JobTracker and optionally monitoring its status.

+ + Normally the user creates the application, describes various facets of the + job via {@link JobConf} and then uses the JobClient to submit + the job and monitor its progress. + +

Here is an example on how to use JobClient:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     job.setInputPath(new Path("in"));
+     job.setOutputPath(new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+
+     // Submit the job, then poll for progress until the job is complete
+     JobClient.runJob(job);
+ 

+ +

Job Control

+ +

At times clients would chain map-reduce jobs to accomplish complex tasks + which cannot be done via a single map-reduce job. This is fairly easy since + the output of the job, typically, goes to distributed file-system and that + can be used as the input for the next job.

+ +

However, this also means that the onus on ensuring jobs are complete + (success/failure) lies squarely on the clients. In such situations the + various job-control options are: +

    1. {@link #runJob(JobConf)} : submits the job and returns only after
       the job has completed.
    2. {@link #submitJob(JobConf)} : only submits the job; the client can then
       poll the returned handle to the {@link RunningJob} to query status and
       make scheduling decisions, as in the sketch below.
    3. {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
       on job-completion, thus avoiding polling.
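 A minimal sketch of option 2 above (MyJob and the 5 second poll interval are only illustrative; error handling is omitted):
 
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     // ... configure the job as in the example below ...
+
+     // Submit without blocking, then poll the returned RunningJob handle.
+     JobClient jc = new JobClient(job);
+     RunningJob running = jc.submitJob(job);
+     while (!running.isComplete()) {
+       System.out.println("map " + running.mapProgress() +
+                          " reduce " + running.reduceProgress());
+       Thread.sleep(5000);
+     }
+     System.out.println(running.isSuccessful() ? "Job succeeded" : "Job failed");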

+ + @see JobConf + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If the parameter {@code loadDefaults} is false, the new instance + will not load resources from the default files. + + @param loadDefaults specifies whether to load from the default files]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if framework should keep the intermediate files + for failed tasks, false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the outputs of the maps are to be compressed, + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This comparator should be provided if the equivalence rules for keys + for sorting the intermediates are different from those for grouping keys + before each call to + {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

+ +

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + in a single call to the reduce function if K1 and K2 compare as equal.

+ +

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + how keys are sorted, this can be used in conjunction to simulate + secondary sort on values.

+ +

Note: This is not a guarantee of the reduce sort being + stable in any sense. (In any case, with the order of available + map-outputs to the reduce being non-deterministic, it wouldn't make + that much sense.)
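 For instance, a job simulating a secondary sort on values might wire up both comparators as below; MyCompositeKeyComparator and MyGroupingComparator are hypothetical RawComparator implementations, not classes provided by the framework:
 
+     // Sort map outputs by the full composite key ...
+     job.setOutputKeyComparatorClass(MyCompositeKeyComparator.class);
+     // ... but group values at the reducer by the primary part of the key only.
+     job.setOutputValueGroupingComparator(MyGroupingComparator.class);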

+ + @param theClass the comparator class to be used for grouping keys. + It should implement RawComparator. + @see #setOutputKeyComparatorClass(Class)]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. Typically the combiner is same as the + the {@link Reducer} for the job i.e. {@link #getReducerClass()}. + + @return the user-defined combiner class used to combine map-outputs.]]> + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. + +

The combiner is a task-level aggregation operation which, in some cases, + helps to cut down the amount of data transferred from the {@link Mapper} to + the {@link Reducer}, leading to better performance.

+ +

Typically the combiner is the same as the Reducer for the + job i.e. {@link #setReducerClass(Class)}.

+ + @param theClass the user-defined combiner class used to combine + map-outputs.]]> +
+
+ + + true. + + @return true if speculative execution be used for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on, else false.]]> + + + + + true. + + @return true if speculative execution be + used for this job for map tasks, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for map tasks, + else false.]]> + + + + + true. + + @return true if speculative execution be used + for reduce tasks for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for reduce tasks, + else false.]]> + + + + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + Note: This is only a hint to the framework. The actual + number of spawned map tasks depends on the number of {@link InputSplit}s + generated by the job's {@link InputFormat#getSplits(JobConf, int)}. + + A custom {@link InputFormat} is typically used to accurately control + the number of map tasks for the job.

+ +

How many maps?

+ +

The number of maps is usually driven by the total size of the inputs + i.e. total number of blocks of the input files.

+ +

The right level of parallelism for maps seems to be around 10-100 maps + per-node, although it has been set up to 300 or so for very cpu-light map + tasks. Task setup takes a while, so it is best if the maps take at least a + minute to execute.

+ +

The default behavior of file-based {@link InputFormat}s is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of input files. However, the {@link FileSystem} blocksize of the + input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Thus, if you expect 10TB of input data and have a blocksize of 128MB, + you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is + used to set it even higher.
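 The figure above is just the split arithmetic; as a sketch:
 
+     long inputBytes = 10L * 1024 * 1024 * 1024 * 1024;   // 10TB of input
+     long blockSize  = 128L * 1024 * 1024;                // 128MB blocks
+     long maps = inputBytes / blockSize;                  // 81,920, i.e. roughly 82,000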

+ + @param n the number of map tasks for this job. + @see InputFormat#getSplits(JobConf, int) + @see FileInputFormat + @see FileSystem#getDefaultBlockSize() + @see FileStatus#getBlockSize()]]> +
+
+ + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + How many reduces? + +

The right number of reduces seems to be 0.95 or + 1.75 multiplied by (<no. of nodes> * + + mapred.tasktracker.reduce.tasks.maximum). +

+ +

With 0.95 all of the reduces can launch immediately and + start transferring map outputs as the maps finish. With 1.75 + the faster nodes will finish their first round of reduces and launch a + second wave of reduces doing a much better job of load balancing.

+ +

Increasing the number of reduces increases the framework overhead, but + increases load balancing and lowers the cost of failures.

+ +

The scaling factors above are slightly less than whole numbers to + reserve a few reduce slots in the framework for speculative-tasks, failures + etc.
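 A rough sketch of applying the factor, given a JobConf named job (the 20-node cluster size is only an assumed figure):
 
+     int reduceSlotsPerNode =
+         job.getInt("mapred.tasktracker.reduce.tasks.maximum", 2);
+     int numNodes = 20;   // assumed cluster size, for illustration only
+     job.setNumReduceTasks((int) (0.95 * numNodes * reduceSlotsPerNode));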

+ +

Reducer NONE

+ +

It is legal to set the number of reduce-tasks to zero.

+ +

In this case the output of the map-tasks goes directly to the distributed + file-system, to the path set by + {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the + framework doesn't sort the map-outputs before writing them out to HDFS.

+ + @param n the number of reduce tasks for this job.]]> +
+
+ + + mapred.map.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per map task.]]> + + + + + + + + + + + mapred.reduce.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per reduce task.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + noFailures, the + tasktracker is blacklisted for this job. + + @param noFailures maximum no. of failures of a given job per tasktracker.]]> + + + + + blacklisted for this job. + + @return the maximum no. of failures of a given job per tasktracker.]]> + + + + + failed. + + Defaults to zero, i.e. any failed map-task results in + the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + failed. + + Defaults to zero, i.e. any failed reduce-task results + in the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The debug script can aid debugging of failed map tasks. The script is + given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setMapDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param mDbgScript the script name]]> +
+
+ + + + + + + + + The debug script can aid debugging of failed reduce tasks. The script + is given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script file needs to be symlinked

+ +

Here is an example on how to submit a script +

+ job.setReduceDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param rDbgScript the script name]]> +
+
+ + + + + + + + null if it hasn't + been set. + @see #setJobEndNotificationURI(String)]]> + + + + + + The uri can contain 2 special parameters: $jobId and + $jobStatus. Those, if present, are replaced by the job's + identifier and completion-status respectively.

+ +

This is typically used by application-writers to implement chaining of + Map-Reduce jobs in an asynchronous manner.
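 For instance (the host and path below are placeholders, not a real service):
 
+     job.setJobEndNotificationURI(
+         "http://myworkflowhost:8080/jobEnd?id=$jobId&status=$jobStatus");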

+ + @param uri the job end notification uri + @see JobStatus + @see Job Completion and Chaining]]> +
+
+ + + + When a job starts, a shared directory is created at location + + ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ . + This directory is exposed to the users through + job.local.dir . + So, the tasks can use this space + as scratch space and share files among them.

+ This value is available as System property also. + + @return The localized job specific shared directory]]> +
+
+ + + + + + + + + + + + + + + + + + JobConf is the primary interface for a user to describe a + map-reduce job to the Hadoop framework for execution. The framework tries to + faithfully execute the job as-is described by JobConf, however: +
    1. Some configuration parameters might have been marked as
       final by administrators and hence cannot be altered.
    2. While some job parameters are straight-forward to set
       (e.g. {@link #setNumReduceTasks(int)}), other parameters interact subtly
       with the rest of the framework and/or job-configuration and are
       relatively more complex for the user to control finely
       (e.g. {@link #setNumMapTasks(int)}).

+ +

JobConf typically specifies the {@link Mapper}, combiner + (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and + {@link OutputFormat} implementations to be used etc. + +

Optionally JobConf is used to specify other advanced facets + of the job such as Comparators to be used, files to be put in + the {@link DistributedCache}, whether or not intermediate and/or job outputs + are to be compressed (and how), debuggability via user-provided scripts + ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}), + for doing post-processing on task logs, task's stdout, stderr, syslog, etc.

+ +

Here is an example on how to configure a job via JobConf:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     FileInputFormat.setInputPaths(job, new Path("in"));
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+     
+     job.setInputFormat(SequenceFileInputFormat.class);
+     job.setOutputFormat(SequenceFileOutputFormat.class);
+ 

+ + @see JobClient + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + any job + run on the jobtracker started at 200707121733, we would use : +
 
+ JobID.getTaskIDsPattern("200707121733", null);
+ 
+ which will return : +
 "job_200707121733_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @return a regex pattern matching JobIDs]]> +
+
+ + + An example JobID is : + job_200707121733_0003 , which represents the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse JobID strings, but rather + use appropriate constructors or {@link #forName(String)} method. + + @see TaskID + @see TaskAttemptID + @see JobTracker#getNewJobId() + @see JobTracker#getStartTime()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + "N/A" + + @return Scheduling information associated to particular Job Queue]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -archives + -files inputjar args]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + zero. + + @param conf configuration for the JobTracker. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Output pairs need not be of the same types as input pairs. A given + input pair may map to zero or many output pairs. Output pairs are + collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the input key. + @param value the input value. + @param output collects mapped keys and values. + @param reporter facility to report progress.]]> +
+ + + Maps are the individual tasks which transform input records into + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link JobConf} for the + job via the {@link JobConfigurable#configure(JobConf)} and initialize + themselves. Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

The framework then calls + {@link #map(Object, Object, OutputCollector, Reporter)} + for each key/value pair in the InputSplit for that task.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the grouping by specifying + a Comparator via + {@link JobConf#setOutputKeyComparatorClass(Class)}.

+ +

The grouped Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

The intermediate, grouped outputs are always stored in + {@link SequenceFile}s. Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the JobConf.

+ +

If the job has + zero + reduces then the output of the Mapper is directly written + to the {@link FileSystem} without grouping by keys.

+ +

Example:

+

+     public class MyMapper<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Mapper<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+       
+       private String mapTaskId;
+       private String inputFile;
+       private int noRecords = 0;
+       
+       public void configure(JobConf job) {
+         mapTaskId = job.get("mapred.task.id");
+         inputFile = job.get("mapred.input.file");
+       }
+       
+       public void map(K key, V val,
+                       OutputCollector<K, V> output, Reporter reporter)
+       throws IOException {
+         // Process the <key, value> pair (assume this takes a while)
+         // ...
+         // ...
+         
+         // Let the framework know that we are alive, and kicking!
+         // reporter.progress();
+         
+         // Process some more
+         // ...
+         // ...
+         
+         // Increment the no. of <key, value> pairs processed
+         ++noRecords;
+
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+        
+         // Every 100 records update application-level status
+         if ((noRecords%100) == 0) {
+           reporter.setStatus(mapTaskId + " processed " + noRecords + 
+                              " from input-file: " + inputFile); 
+         }
+         
+         // Output the result
+         output.collect(key, val);
+       }
+     }
+ 

+ +

Applications may write a custom {@link MapRunnable} to exert greater + control on map processing e.g. multi-threaded Mappers etc.

+ + @see JobConf + @see InputFormat + @see Partitioner + @see Reducer + @see MapReduceBase + @see MapRunnable + @see SequenceFile]]> +
+
+ + + + + + + + + + + + + + + + + + + + + Provides default no-op implementations for a few methods, most non-trivial + applications need to override some of them.

]]> +
+
+ + + + + + + + + + + <key, value> pairs. + +

Mapping of input records to output records is complete when this method + returns.

+ + @param input the {@link RecordReader} to read the input records. + @param output the {@link OutputCollector} to collect the outputrecords. + @param reporter {@link Reporter} to report progress, status-updates etc. + @throws IOException]]> +
+
+ + Custom implementations of MapRunnable can exert greater + control on map processing e.g. multi-threaded, asynchronous mappers etc.

+ + @see Mapper]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly + equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} + to construct RecordReader's for MultiFileSplit's. + @see MultiFileSplit]]> +
+
+ + + + + + + + + + + + + + + + + th Path]]> + + + + + + + + + + + th Path]]> + + + + + + + + + + + + + + + + + + + + + + + MultiFileSplit can be used to implement {@link RecordReader}'s, with + reading one record per file. + @see FileSplit + @see MultiFileInputFormat]]> + + + + + + + + + + + + + + + <key, value> pairs output by {@link Mapper}s + and {@link Reducer}s. + +

OutputCollector is the generalization of the facility + provided by the Map-Reduce framework to collect data output by either the + Mapper or the Reducer i.e. intermediate outputs + or the output of the job.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OutputCommitter describes the commit of task output for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputCommitter of + the job to:

+

    1. Setup the job during initialization. For example, create the temporary
       output directory for the job during the initialization of the job.
    2. Cleanup the job after the job completion. For example, remove the
       temporary output directory after the job completion.
    3. Setup the task temporary output.
    4. Check whether a task needs a commit. This is to avoid the commit
       procedure if a task does not need commit.
    5. Commit of the task output.
    6. Discard the task commit.
+ + @see FileOutputCommitter + @see JobContext + @see TaskAttemptContext]]> +
+
+ + + + + + + + + + + + + + + + + + + This is to validate the output specification for the job when it is + a job is submitted. Typically checks that it does not already exist, + throwing an exception when it already exists, so that output is not + overwritten.

+ + @param ignored + @param job job configuration. + @throws IOException when output should not be attempted]]> +
+
+ + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    1. Validate the output-specification of the job. For e.g. check that the
       output directory doesn't already exist.
    2. Provide the {@link RecordWriter} implementation to be used to write out
       the output files of the job. Output files are stored in a
       {@link FileSystem}.
+ + @see RecordWriter + @see JobConf]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be paritioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.

+ + @see Reducer]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0 to 1.0. + @throws IOException]]> + + + + RecordReader reads <key, value> pairs from an + {@link InputSplit}. + +

RecordReader, typically, converts the byte-oriented view of + the input, provided by the InputSplit, and presents a + record-oriented view for the {@link Mapper} & {@link Reducer} tasks for + processing. It thus assumes the responsibility of processing record + boundaries and presenting the tasks with keys and values.

+ + @see InputSplit + @see InputFormat]]> +
+
+ + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param reporter facility to report progress. + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + Reduces values for a given key. + +

The framework calls this method for each + <key, (list of values)> pair in the grouped inputs. + Output values must be of the same type as input values. Input keys must + not be altered. The framework will reuse the key and value objects + that are passed into the reduce, therefore the application should clone + the objects they want to keep a copy of. In many cases, all values are + combined into zero or one value. +

+ +

Output pairs are collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the key. + @param values the list of values to reduce. + @param output to collect keys and combined values. + @param reporter facility to report progress.]]> +
+ + + The number of Reducers for the job is set by the user via + {@link JobConf#setNumReduceTasks(int)}. Reducer implementations + can access the {@link JobConf} for the job via the + {@link JobConfigurable#configure(JobConf)} method and initialize themselves. + Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

Reducer has 3 primary phases:

+
    1. Shuffle

       Reducer is input the grouped output of a {@link Mapper}.
       In this phase the framework, for each Reducer, fetches the
       relevant partition of the output of all the Mappers, via HTTP.

    2. Sort

       The framework groups Reducer inputs by keys
       (since different Mappers may have output the same key) in this
       stage.

       The shuffle and sort phases occur simultaneously i.e. while outputs are
       being fetched they are merged.

       SecondarySort

       If equivalence rules for keys while grouping the intermediates are
       different from those for grouping keys before reduction, then one may
       specify a Comparator via
       {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
       {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
       control how intermediate keys are grouped, these can be used in
       conjunction to simulate secondary sort on values.

       For example, say that you want to find duplicate web pages and tag them
       all with the url of the "best" known example. You would set up the job
       like:
         • Map Input Key: url
         • Map Input Value: document
         • Map Output Key: document checksum, url pagerank
         • Map Output Value: url
         • Partitioner: by checksum
         • OutputKeyComparator: by checksum and then decreasing pagerank
         • OutputValueGroupingComparator: by checksum

    3. Reduce

       In this phase the
       {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
       method is called for each <key, (list of values)> pair in
       the grouped inputs.

       The output of the reduce task is typically written to the
       {@link FileSystem} via
       {@link OutputCollector#collect(Object, Object)}.
+ +

The output of the Reducer is not re-sorted.

+ +

Example:

+

+     public class MyReducer<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Reducer<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+        
+       private String reduceTaskId;
+       private int noKeys = 0;
+       
+       public void configure(JobConf job) {
+         reduceTaskId = job.get("mapred.task.id");
+       }
+       
+       public void reduce(K key, Iterator<V> values,
+                          OutputCollector<K, V> output, 
+                          Reporter reporter)
+       throws IOException {
+       
+         // Process
+         int noValues = 0;
+         while (values.hasNext()) {
+           V value = values.next();
+           
+           // Increment the no. of values for this key
+           ++noValues;
+           
+           // Process the <key, value> pair (assume this takes a while)
+           // ...
+           // ...
+           
+           // Let the framework know that we are alive, and kicking!
+           if ((noValues%10) == 0) {
+             reporter.progress();
+           }
+         
+           // Process some more
+           // ...
+           // ...
+           
+           // Output the <key, value> 
+           output.collect(key, value);
+         }
+         
+         // Increment the no. of <key, list of values> pairs processed
+         ++noKeys;
+         
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+         
+         // Every 100 keys update application-level status
+         if ((noKeys%100) == 0) {
+           reporter.setStatus(reduceTaskId + " processed " + noKeys);
+         }
+       }
+     }
+ 

+ + @see Mapper + @see Partitioner + @see Reporter + @see MapReduceBase]]> +
+
+ + + + + + + + + + + + + + + Counter of the given group/name.]]> + + + + + + + Enum. + @param amount A non-negative amount by which the counter is to + be incremented.]]> + + + + + + + + + + + + + + InputSplit that the map is reading from. + @throws UnsupportedOperationException if called outside a mapper]]> + + + + + + + + + {@link Mapper} and {@link Reducer} can use the Reporter + provided to report progress or just indicate that they are alive. In + scenarios where the application takes an insignificant amount of time to + process individual key/value pairs, this is crucial since the framework + might assume that the task has timed-out and kill that task. + +

Applications can also update {@link Counters} via the provided + Reporter .

+ + @see Progressable + @see Counters]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + progress of the job's cleanup-tasks, as a float between 0.0 + and 1.0. When all cleanup tasks have completed, the function returns 1.0. + + @return the progress of the job's cleanup-tasks. + @throws IOException]]> + + + + + + progress of the job's setup-tasks, as a float between 0.0 + and 1.0. When all setup tasks have completed, the function returns 1.0. + + @return the progress of the job's setup-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RunningJob is the user-interface to query for details on a + running Map-Reduce job. + +

Clients can get hold of RunningJob via the {@link JobClient} + and then query the running-job for details such as name, configuration, + progress etc.

+ + @see JobClient]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This allows the user to specify the key class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + This allows the user to specify the value class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f. The filtering criteria is + MD5(key) % f == 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + f using + the criteria record# % f == 0. + For example, if the frequency is 10, one out of 10 records is returned.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if auto increment + {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}. + false otherwise.]]> + + + + + + + + + + + + + true if auto increment + {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}. + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop provides an optional mode of execution in which the bad records + are detected and skipped in further attempts. + +

This feature can be used when map/reduce tasks crash deterministically on + certain input. This happens due to bugs in the map/reduce function. The usual + course would be to fix these bugs. But sometimes this is not possible; + perhaps the bug is in third party libraries for which the source code is + not available. Due to this, the task never reaches completion even with + multiple attempts, and the complete data for that task is lost.

+ +

With this feature, only a small portion of data is lost surrounding + the bad record, which may be acceptable for some user applications; + see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}.

+ +

The skipping mode gets kicked off after a certain number of failures; + see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}.

+ +

In the skipping mode, the map/reduce task maintains the record range which + is getting processed at all times. Before giving the input to the + map/reduce function, it sends this record range to the TaskTracker. + If the task crashes, the TaskTracker knows which was the last reported + range, and on further attempts that range gets skipped.
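 A sketch of enabling the behaviour described above through the helpers mentioned in this section (the numbers are illustrative only):
 
+     // Start skipping after two failed attempts and tolerate losing at most
+     // one record around each bad map input.
+     SkipBadRecords.setAttemptsToStartSkipping(conf, 2);
+     SkipBadRecords.setMapperMaxSkipRecords(conf, 1);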

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all task attempt IDs + of any jobtracker, in any job, of the first + map task, we would use : +
 
+ TaskAttemptID.getTaskAttemptIDsPattern(null, null, true, 1, null);
+ 
+ which will return : +
 "attempt_[^_]*_[0-9]*_m_000001_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @param attemptId the task attempt number, or null + @return a regex pattern matching TaskAttemptIDs]]> +
+
+ + + An example TaskAttemptID is : + attempt_200707121733_0003_m_000005_0 , which represents the + zeroth task attempt for the fifth map task in the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse TaskAttemptID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the first map task + of any jobtracker, of any job, we would use : +

 
+ TaskID.getTaskIDsPattern(null, null, true, 1);
+ 
+ which will return : +
 "task_[^_]*_[0-9]*_m_000001*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @return a regex pattern matching TaskIDs]]> +
+ + + + An example TaskID is : + task_200707121733_0003_m_000005 , which represents the + fifth map task in the third job running at the jobtracker + started at 200707121733. +

+ Applications should never construct or parse TaskID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.log.dir.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the Job was added.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ([,]*) + func ::= tbl(,"") + class ::= @see java.lang.Class#forName(java.lang.String) + path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) + } + Reads expression from the mapred.join.expr property and + user-supplied join types from mapred.join.define.<ident> + types. Paths supplied to tbl are given as input paths to the + InputFormat class listed. + @see #compose(java.lang.String, java.lang.Class, java.lang.String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ,

) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + mapred.join.define.<ident> to a classname. In the expression + mapred.join.expr, the identifier will be assumed to be a + ComposableRecordReader. + mapred.join.keycomparator can be a classname used to compare keys + in the join. + @see JoinRecordReader + @see MultiFilterRecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + capacity children to position + id in the parent reader. + The id of a root CompositeRecordReader is -1 by convention, but relying + on this is not recommended.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + override(S1,S2,S3) will prefer values + from S3 over S2, and values from S2 over S1 for all keys + emitted from all sources.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [,,...,]]]> + + + + + + + out. + TupleWritable format: + {@code + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Mapper leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Mapper does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Mapper the configuration given for it, + mapperConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain +

+ + @param job job's JobConf to add the Mapper class. + @param klass the Mapper class to add. + @param inputKeyClass mapper input key class. + @param inputValueClass mapper input value class. + @param outputKeyClass mapper output key class. + @param outputValueClass mapper output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param mapperConf a JobConf with the configuration for the Mapper + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + If this method is overriden super.configure(...) should be + invoked at the beginning of the overwriter method.]]> + + + + + + + + + + map(...) methods of the Mappers in the chain.]]> + + + + + + + If this method is overriden super.close() should be + invoked at the end of the overwriter method.]]> + + + + + The Mapper classes are invoked in a chained (or piped) fashion, the output of + the first becomes the input of the second, and so on until the last Mapper, + the output of the last Mapper will be written to the task's output. +

+ The key functionality of this feature is that the Mappers in the chain do not + need to be aware that they are executed in a chain. This enables having + reusable specialized Mappers that can be combined to perform composite + operations within a single task. +

+ Special care has to be taken when creating chains that the key/values output + by a Mapper are valid for the following Mapper in the chain. It is assumed + all Mappers and the Reducer in the chain use matching output and input key and + value classes as no conversion is done by the chaining code. +

+ Using the ChainMapper and the ChainReducer classes it is possible to compose + Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An + immediate benefit of this pattern is a dramatic reduction in disk IO. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain. +

+ ChainMapper usage pattern: +

+

+ ...
+ conf.setJobName("chain");
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+ 

+ JobConf mapAConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class, + Text.class, Text.class, true, mapAConf); +

+ JobConf mapBConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, mapBConf); +

+ JobConf reduceConf = new JobConf(false); + ... + ChainReducer.setReducer(conf, XReduce.class, LongWritable.class, Text.class, + Text.class, Text.class, true, reduceConf); +

+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, null); +

+ ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class, + LongWritable.class, LongWritable.class, true, null); +

+ FileInputFormat.setInputPaths(conf, inDir); + FileOutputFormat.setOutputPath(conf, outDir); + ... +

+ JobClient jc = new JobClient(conf); + RunningJob job = jc.submitJob(conf); + ... +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Reducer leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Reducer does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Reducer the configuration given for it, + reducerConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainReducer, this is done by the setReducer or the addMapper for the last + element in the chain. + + @param job job's JobConf to add the Reducer class. + @param klass the Reducer class to add. + @param inputKeyClass reducer input key class. + @param inputValueClass reducer input value class. + @param outputKeyClass reducer output key class. + @param outputValueClass reducer output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param reducerConf a JobConf with the configuration for the Reducer + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Mapper leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Mapper does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Mapper the configuration given for it, + mapperConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain + . + + @param job chain job's JobConf to add the Mapper class. + @param klass the Mapper class to add. + @param inputKeyClass mapper input key class. + @param inputValueClass mapper input value class. + @param outputKeyClass mapper output key class. + @param outputValueClass mapper output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param mapperConf a JobConf with the configuration for the Mapper + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + If this method is overriden super.configure(...) should be + invoked at the beginning of the overwriter method.]]> + + + + + + + + + + reduce(...) method of the Reducer with the + map(...) methods of the Mappers in the chain.]]> + + + + + + + If this method is overriden super.close() should be + invoked at the end of the overwriter method.]]> + + + + + For each record output by the Reducer, the Mapper classes are invoked in a + chained (or piped) fashion, the output of the first becomes the input of the + second, and so on until the last Mapper, the output of the last Mapper will + be written to the task's output. +

+ The key functionality of this feature is that the Mappers in the chain do not + need to be aware that they are executed after the Reducer or in a chain. + This enables having reusable specialized Mappers that can be combined to + perform composite operations within a single task. +

+ Special care has to be taken when creating chains that the key/values output + by a Mapper are valid for the following Mapper in the chain. It is assumed + all Mappers and the Reducer in the chain use matching output and input key and + value classes as no conversion is done by the chaining code. +

+ Using the ChainMapper and the ChainReducer classes it is possible to compose + Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An + immediate benefit of this pattern is a dramatic reduction in disk IO. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainReducer, this is done by the setReducer or the addMapper for the last + element in the chain. +

+ ChainReducer usage pattern: +

+

+ ...
+ conf.setJobName("chain");
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+ 

+ JobConf mapAConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class, + Text.class, Text.class, true, mapAConf); +

+ JobConf mapBConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, mapBConf); +

+ JobConf reduceConf = new JobConf(false); + ... + ChainReducer.setReducer(conf, XReduce.class, LongWritable.class, Text.class, + Text.class, Text.class, true, reduceConf); +

+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, null); +

+ ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class, + LongWritable.class, LongWritable.class, true, null); +

+ FileInputFormat.setInputPaths(conf, inDir); + FileOutputFormat.setOutputPath(conf, outDir); + ... +

+ JobClient jc = new JobClient(conf); + RunningJob job = jc.submitJob(conf); + ... +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + @param freq The frequency with which records will be emitted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + This will read every split at the client, which is very expensive. + @param freq Probability with which a key will be chosen. + @param numSamples Total number of samples to obtain from all selected + splits.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + Takes the first numSamples / numSplits records from each split. + @param numSamples Total number of samples to obtain from all selected + splits.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the name output is multi, false + if it is single. If the name output is not defined it returns + false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @param conf job conf to add the named output + @param namedOutput named output name, it has to be a word, letters + and numbers only, cannot be the word 'part' as + that is reserved for the + default output. + @param outputFormatClass OutputFormat class. + @param keyClass key class + @param valueClass value class]]> + + + + + + + + + + + + @param conf job conf to add the named output + @param namedOutput named output name, it has to be a word, letters + and numbers only, cannot be the word 'part' as + that is reserved for the + default output. + @param outputFormatClass OutputFormat class. + @param keyClass key class + @param valueClass value class]]> + + + + + + + + By default these counters are disabled. +

+ MultipleOutputs supports counters, by default they are disabled. + The counters group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi + named outputs the name of the counter is the concatenation of the named + output, and underscore '_' and the multiname. + + @param conf job conf to enableadd the named output. + @param enabled indicates if the counters will be enabled or not.]]> +
+
+ + + + + By default these counters are disabled. +

+ MultipleOutputs supports counters, by default they are disabled. + The counters group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi + named outputs the name of the counter is the concatenation of the named + output, and underscore '_' and the multiname. + + + @param conf job conf to enableadd the named output. + @return TRUE if the counters are enabled, FALSE if they are disabled.]]> +
+
+ + + + + + + + + + + + + @param namedOutput the named output name + @param reporter the reporter + @return the output collector for the given named output + @throws IOException thrown if output collector could not be created]]> + + + + + + + + + + + @param namedOutput the named output name + @param multiName the multi name part + @param reporter the reporter + @return the output collector for the given named output + @throws IOException thrown if output collector could not be created]]> + + + + + + + If overriden subclasses must invoke super.close() at the + end of their close() + + @throws java.io.IOException thrown if any of the MultipleOutput files + could not be closed properly.]]> + + + + OutputCollector passed to + the map() and reduce() methods of the + Mapper and Reducer implementations. +

+ Each additional output, or named output, may be configured with its own + OutputFormat, with its own key class and with its own value + class. +

+ A named output can be a single file or a multi file. The latter is referred to as + a multi named output. +

+ A multi named output is an unbounded set of files, all sharing the same + OutputFormat, key class and value class configuration. +

+ When named outputs are used within a Mapper implementation, + key/values written to a named output are not part of the reduce phase; only + key/values written to the job OutputCollector are part of the + reduce phase. +

+ MultipleOutputs supports counters; by default they are disabled. The counters + group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi + named outputs the name of the counter is the concatenation of the named + output, an underscore '_', and the multiname. +

+ Job configuration usage pattern is: +

+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+   SequenceFileOutputFormat.class,
+   LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+ 
+

+ Usage pattern in a Reducer implementation is: +

+
+ public class MOReduce implements
+   Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Map implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured to use this MapRunnable class (using + the JobConf.setMapRunnerClass method), and the number of threads the + thread-pool can use is set with the + mapred.map.multithreadedrunner.threads property; its default + value is 10 threads. +
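A hedged sketch of the configuration just described, assuming the documented runner is org.apache.hadoop.mapred.lib.MultithreadedMapRunner; the pool size of 20 is illustrative.
<pre>
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.MultithreadedMapRunner;

public class MultithreadedJobSetup {
  public static void configure(JobConf conf) {
    // Replace the default MapRunner with the multithreaded runner.
    conf.setMapRunnerClass(MultithreadedMapRunner.class);
    // Raise the pool size above the default of 10 threads.
    conf.setInt("mapred.map.multithreadedrunner.threads", 20);
  }
}
</pre>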

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pairs. Uses + {@link StringTokenizer} to break text into tokens.]]> + + + + + + + + + + + + total.order.partitioner.natural.order is not false, a trie + of the first total.order.partitioner.max.trie.depth(2) + 1 bytes + will be built. Otherwise, keys will be located using a binary search of + the partition keyset using the {@link org.apache.hadoop.io.RawComparator} + defined for this job. The input file must be sorted with the same + comparator and contain {@link + org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.]]> + + + + + + + + + + + + R reduces, there are R-1 + keys in the SequenceFile.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + generateKeyValPairs(Object key, Object value); public void + configure(JobConfjob); } + + The package also provides a base class, ValueAggregatorBaseDescriptor, + implementing the above interface. The user can extend the base class and + implement generateKeyValPairs accordingly. + + The primary work of generateKeyValPairs is to emit one or more key/value + pairs based on the input key/value pair. The key in an output key/value pair + encode two pieces of information: aggregation type and aggregation id. The + value will be aggregated onto the aggregation id according the aggregation + type. + + This class offers a function to generate a map/reduce job using Aggregate + framework. The function takes the following parameters: input directory spec + input format (text or sequence file) output directory a file specifying the + user plugin class]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The job can be configured using the static methods in this class, + {@link DBInputFormat}, and {@link DBOutputFormat}. +

+ Alternatively, the properties can be set in the configuration with proper + values. + + @see DBConfiguration#configureDB(JobConf, String, String, String, String) + @see DBInputFormat#setInput(JobConf, Class, String, String) + @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...) + @see DBOutputFormat#setOutput(JobConf, String, String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 20070101 AND length > 0)' + @param orderBy the fieldNames in the orderBy clause. + @param fieldNames The field names in the table + @see #setInput(JobConf, Class, String, String)]]> + + + + + + + + + + + + + + DBInputFormat emits LongWritables containing the record number as + key and DBWritables as value. + + The SQL query, and input class can be using one of the two + setInput methods.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {@link DBOutputFormat} accepts <key,value> pairs, where + key has a type extending DBWritable. Returned {@link RecordWriter} + writes only the key to the database with a batch SQL query.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DBWritable. DBWritable, is similar to {@link Writable} + except that the {@link #write(PreparedStatement)} method takes a + {@link PreparedStatement}, and {@link #readFields(ResultSet)} + takes a {@link ResultSet}. +

+ Implementations are responsible for writing the fields of the object + to PreparedStatement, and reading the fields of the object from the + ResultSet. + +

Example:

+ If we have the following table in the database: +
+ CREATE TABLE MyTable (
+   counter        INTEGER NOT NULL,
+   timestamp      BIGINT  NOT NULL,
+ );
+ 
+ then we can read/write the tuples from/to the table with: +

+ public class MyWritable implements Writable, DBWritable {
+   // Some data     
+   private int counter;
+   private long timestamp;
+       
+   //Writable#write() implementation
+   public void write(DataOutput out) throws IOException {
+     out.writeInt(counter);
+     out.writeLong(timestamp);
+   }
+       
+   //Writable#readFields() implementation
+   public void readFields(DataInput in) throws IOException {
+     counter = in.readInt();
+     timestamp = in.readLong();
+   }
+       
+   public void write(PreparedStatement statement) throws SQLException {
+     statement.setInt(1, counter);
+     statement.setLong(2, timestamp);
+   }
+       
+   public void readFields(ResultSet resultSet) throws SQLException {
+     counter = resultSet.getInt(1);
+     timestamp = resultSet.getLong(2);
+   } 
+ }
+ 

]]> +
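A hedged sketch of wiring a DBWritable such as the MyWritable shown above into a job, using the configureDB/setInput/setOutput helpers referenced in this documentation. The org.apache.hadoop.mapred.lib.db package, the JDBC driver, URL, credentials and table names are assumptions for illustration, and MyWritable is taken to be the class from the example above, available on the classpath.
<pre>
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBInputFormat;
import org.apache.hadoop.mapred.lib.db.DBOutputFormat;

public class DbJobSetup {
  public static void configure(JobConf job) {
    // JDBC connection settings (illustrative values only).
    DBConfiguration.configureDB(job, "org.hsqldb.jdbcDriver",
        "jdbc:hsqldb:hsql://localhost/mydb", "sa", "");
    // Read MyTable, ordered by timestamp, into MyWritable records.
    DBInputFormat.setInput(job, MyWritable.class, "MyTable",
        null /* no conditions */, "timestamp", "counter", "timestamp");
    // Write the keys (DBWritables) back to another two-column table.
    DBOutputFormat.setOutput(job, "MyTableCopy", "counter", "timestamp");
  }
}
</pre>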
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When constructing the instance, if the factory property + contextName.class exists, + its value is taken to be the name of the class to instantiate. Otherwise, + the default is to create an instance of + org.apache.hadoop.metrics.spi.NullContext, which is a + dummy "no-op" context which will cause all metric data to be discarded. + + @param contextName the name of the context + @return the named MetricsContext]]> + + + + + + + + + + + + + + When the instance is constructed, this method checks if the file + hadoop-metrics.properties exists on the class path. If it + exists, it must be in the format defined by java.util.Properties, and all + the properties in the file are set as attributes on the newly created + ContextFactory instance. + + @return the singleton ContextFactory instance]]> + + + + getFactory() method.]]> + + + + + + + + + + + + + + + + + + + startMonitoring() again after calling + this. + @see #close()]]> + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A record name identifies the kind of data to be reported. For example, a + program reporting statistics relating to the disks on a computer might use + a record name "diskStats".

+ + A record has zero or more tags. A tag has a name and a value. To + continue the example, the "diskStats" record might use a tag named + "diskName" to identify a particular disk. Sometimes it is useful to have + more than one tag, so there might also be a "diskType" with value "ide" or + "scsi" or whatever.

+ + A record also has zero or more metrics. These are the named + values that are to be reported to the metrics system. In the "diskStats" + example, possible metric names would be "diskPercentFull", "diskPercentBusy", + "kbReadPerSecond", etc.

+ + The general procedure for using a MetricsRecord is to fill in its tag and + metric values, and then call update() to pass the record to the + client library. + Metric data is not immediately sent to the metrics system + each time that update() is called. + An internal table is maintained, identified by the record name. This + table has columns + corresponding to the tag and the metric names, and rows + corresponding to each unique set of tag values. An update + either modifies an existing row in the table, or adds a new row with a set of + tag values that are different from all the other rows. Note that if there + are no tags, then there can be at most one row in the table.
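A minimal sketch of the fill-in-then-update() procedure just described, using the "diskStats" example; the context name, record name and metric names are illustrative, and the org.apache.hadoop.metrics package layout is assumed.
<pre>
import org.apache.hadoop.metrics.ContextFactory;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;

public class DiskStatsReporter {
  public static void report(String diskName, float percentFull) throws Exception {
    // Obtain the named context from the singleton factory.
    MetricsContext context = ContextFactory.getFactory().getContext("diskStats");
    MetricsRecord record = context.createRecord("diskStats");
    // One row per disk: the tag value identifies the row in the internal table.
    record.setTag("diskName", diskName);
    record.setMetric("diskPercentFull", percentFull);
    // Hand the values to the client library; they are emitted on the next timer period.
    record.update();
  }
}
</pre>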

+ + Once a row is added to the table, its data will be sent to the metrics system + on every timer period, whether or not it has been updated since the previous + timer period. If this is inappropriate, for example if metrics were being + reported by some transient object in an application, the remove() + method can be used to remove the row and thus stop the data from being + sent.

+ + Note that the update() method is atomic. This means that it is + safe for different threads to be updating the same metric. More precisely, + it is OK for different threads to call update() on MetricsRecord instances + with the same set of tag names and tag values. Different threads should + not use the same MetricsRecord instance at the same time.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MetricsContext.registerUpdater().]]> + + + + + + + + + + + + + + + + + + + + + + + + + fileName attribute, + if specified. Otherwise the data will be written to standard + output.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is configured by setting ContextFactory attributes which in turn + are usually configured through a properties file. All the attributes are + prefixed by the contextName. For example, the properties file might contain: +

+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contextName.tableName. The returned map consists of + those attributes with the contextName and tableName stripped off.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class implements the internal table of metric data, and the timer + on which data is to be sent to the metrics system. Subclasses must + override the abstract emitRecord method in order to transmit + the data.

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + update + and remove().]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hostname or hostname:port. If + the specs string is null, defaults to localhost:defaultPort. + + @return a list of InetSocketAddress objects.]]> + + + + + + + + + + + + + + + + + + + ,name=" + Where the and are the supplied parameters + + @param serviceName + @param nameName + @param theMbean - the MBean to register + @return the named used to register the MBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.rpc.socket.factory.class.<ClassName>. When no + such parameter exists then fall back on the default socket factory as + configured by hadoop.rpc.socket.factory.class.default. If + this default socket factory is not configured, then fall back on the JVM + default socket factory. + + @param conf the configuration + @param clazz the class (usually a {@link VersionedProtocol}) + @return a socket factory]]> + + + + + + hadoop.rpc.socket.factory.default + + @param conf the configuration + @return the default socket factory as specified in the configuration or + the JVM default socket factory if the configuration does not + contain a default socket factory property.]]> + + + + + + + + + + + + + : + ://:/]]> + + + + + + + + : + ://:/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketInputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getInputStream()} is returned. In the latter + case, the timeout argument is ignored and the timeout set with + {@link Socket#setSoTimeout(int)} applies for reads.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see #getInputStream(Socket, long) + + @param socket + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + +
+ + From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketOutputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getOutputStream()} is returned. In the latter + case, the timeout argument is ignored and the write will wait until + data is available.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see #getOutputStream(Socket, long) + + @param socket + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
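A hedged sketch of the getInputStream/getOutputStream contracts described above; org.apache.hadoop.net.NetUtils is assumed to be the class being documented, and the 10 second timeout and probe byte are illustrative.
<pre>
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import org.apache.hadoop.net.NetUtils;

public class TimedSocketIO {
  // Obtain both streams through NetUtils so that, when the socket has an
  // associated channel, the 10 second timeout is enforced on reads and writes.
  public static int writeProbeAndRead(Socket socket) throws IOException {
    InputStream in = NetUtils.getInputStream(socket, 10000);
    OutputStream out = NetUtils.getOutputStream(socket, 10000);
    out.write(0);      // send a single probe byte
    return in.read();  // read the first reply byte
  }
}
</pre>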
+ + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + node + + @param node + a node + @return true if node is already in the tree; false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scope + if scope starts with ~, choose one from the all nodes except for the + ones in scope; otherwise, choose one from scope + @param scope range of nodes from which a node will be choosen + @return the choosen node]]> + + + + + + + scope but not in excludedNodes + if scope starts with ~, return the number of nodes that are not + in scope and excludedNodes; + @param scope a path string that may start with ~ + @param excludedNodes a list of nodes + @return number of available nodes]]> + + + + + + + + + + + + reader + It linearly scans the array, if a local node is found, swap it with + the first element of the array. + If a local rack node is found, swap it with the first element following + the local node. + If neither local node or local rack node is found, put a random replica + location at postion 0. + It leaves the rest nodes untouched.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout in milliseconds; must not be negative. + @throws IOException]]> +
+
+ + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + +
+ + Create a new output stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout in milliseconds; must not be negative. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = getCount(). + @param newCapacity The new capacity in bytes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index idx = startVector(...); + while (!idx.done()) { + .... // read element of a vector + idx.incr(); + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This task takes the given record definition files and compiles them into + java or c++ + files. It is then up to the user to compile the generated files. + +

The task requires the file or the nested fileset element to be + specified. Optional attributes are language (set the output + language; default is "java"), + destdir (name of the destination directory for generated java/c++ + code; default is ".") and failonerror (specifies error-handling + behavior; default is true). +

Usage

+
+ <recordcc
+       destdir="${basedir}/gensrc"
+       language="java">
+   <fileset include="**\/*.jr" />
+ </recordcc>
+ 
]]> +
+
+ +
+ + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi as a comma separated string in + conf as a property attr + + The String starts with the user name followed by the default group names, + and other group names. + + @param conf configuration + @param attr property name + @param ugi a UnixUserGroupInformation]]> + + + + + + + + conf + + The object is expected to store with the property name attr + as a comma separated string that starts + with the user name followed by group names. + If the property name is not defined, return null. + It's assumed that there is only one UGI per user. If this user already + has a UGI in the ugi map, return the ugi in the map. + Otherwise, construct a UGI from the configuration, store it in the + ugi map and return it. + + @param conf configuration + @param attr property name + @return a UnixUGI + @throws LoginException if the stored string is ill-formatted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool supports archiving and anaylzing (sort/grep) of log-files. + It takes as input + a) Input uri which will serve uris of the logs to be archived. + b) Output directory (not mandatory). + b) Directory on dfs to archive the logs. + c) The sort/grep patterns for analyzing the files and separator for boundaries. + Usage: + Logalyzer -archive -archiveDir -analysis -logs -grep -sort -separator +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + in]]> + + + + + + + out.]]> + + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GenericOptionsParser to parse only the generic Hadoop + arguments. + + The array of string arguments other than the generic arguments can be + obtained by {@link #getRemainingArgs()}. + + @param conf the Configuration to modify. + @param args command-line arguments.]]> + + + + + GenericOptionsParser to parse given options as well + as generic Hadoop options. + + The resulting CommandLine object can be obtained by + {@link #getCommandLine()}. + + @param conf the configuration to modify + @param options options built by the caller + @param args User-specified arguments]]> + + + + + Strings containing the un-parsed arguments + or empty array if commandLine was not defined.]]> + + + + + CommandLine object + to process the parsed arguments. + + Note: If the object is created with + {@link #GenericOptionsParser(Configuration, String[])}, then returned + object will only contain parsed generic options. + + @return CommandLine representing list of arguments + parsed against Options descriptor.]]> + + + + + + + + + + + + + + + + + GenericOptionsParser is a utility to parse command line + arguments generic to the Hadoop framework. + + GenericOptionsParser recognizes several standarad command + line arguments, enabling applications to easily specify a namenode, a + jobtracker, additional configuration resources etc. + +

Generic Options

+ +

The supported generic options are:

+

+     -conf <configuration file>     specify a configuration file
+     -D <property=value>            use value for given property
+     -fs <local|namenode:port>      specify a namenode
+     -jt <local|jobtracker:port>    specify a job tracker
+     -files <comma separated list of files>    specify comma separated
+                            files to be copied to the map reduce cluster
+     -libjars <comma separated list of jars>   specify comma separated
+                            jar files to include in the classpath.
+     -archives <comma separated list of archives>    specify comma
+             separated archives to be unarchived on the compute machines.
+
+ 

+ +

The general command line syntax is:

+

+ bin/hadoop command [genericOptions] [commandOptions]
+ 

+ +

Generic command line arguments might modify + Configuration objects that are given to constructors.

+ +

The functionality is implemented using Commons CLI.

+ +

Examples:

+

+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+ 
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+     
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+     
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+ 
+ $ bin/hadoop jar -libjars testlib.jar 
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+ 

+ + @see Tool + @see ToolRunner]]> +
+
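A small programmatic sketch of the parsing flow described above: strip the generic Hadoop options from argv and keep the remainder for the application. The class name ParseArgs is hypothetical, and org.apache.hadoop.util is assumed as the parser's package.
<pre>
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class ParseArgs {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // -fs, -jt, -D, -files, -libjars and -archives are consumed here
    // and applied to the Configuration.
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] toolArgs = parser.getRemainingArgs();
    System.out.println("application-specific args: " + toolArgs.length);
  }
}
</pre>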
+ + + + + + + + + Class<T>) of the + argument of type T. + @param The type of the argument + @param t the object to get it class + @return Class<T>]]> + + + + + + + List<T> to a an array of + T[]. + @param c the Class object of the items in the list + @param list the list to convert]]> + + + + + + List<T> to a an array of + T[]. + @param list the list to convert + @throws ArrayIndexOutOfBoundsException if the list is empty. + Use {@link #toArray(Class, List)} if the list may be empty.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + io.file.buffer.size specified in the given + Configuration. + @param in input stream + @param conf configuration + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-hadoop is loaded, + else false]]> + + + + + + true if native hadoop libraries, if present, can be + used for this job; false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { pq.top().change(); pq.adjustTop(); } + instead of
+  { o = pq.pop(); o.change(); pq.push(o); }
+ 
]]> +
+
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clients and/or applications can use the provided Progressable + to explicitly report progress to the Hadoop framework. This is especially + important for operations which take an insignificant amount of time since, + in-lieu of the reported progress, the framework has to assume that an error + has occured and time-out the operation.

]]> +
+
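A hedged sketch of the reporting pattern this interface enables: a long-running operation calls progress() periodically so the framework does not assume it has hung. The chunked work loop and class name are illustrative.
<pre>
import org.apache.hadoop.util.Progressable;

public class SlowCopy {
  public static void copyChunks(int chunks, Progressable progress) {
    for (int i = 0; i < chunks; i++) {
      // ... move one chunk of data here ...
      progress.progress();   // tell the framework we are still alive
    }
  }
}
</pre>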
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Class is to be obtained + @return the correctly typed Class of the given object.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop Pipes + or Hadoop Streaming. + + It also checks to ensure that we are running on a *nix platform else + (e.g. in Cygwin/Windows) it returns null. + @param conf configuration + @return a String[] with the ulimit command arguments or + null if we are running on a non *nix platform or + if the limit is unspecified.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell interface. + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell can be used to run unix commands like du or + df. It also offers facilities to gate commands by + time-intervals.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ShellCommandExecutorshould be used in cases where the output + of the command needs no explicit parsing and where the command, working + directory and the environment remains unchanged. The output of the command + is stored as-is and is expected to be small.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ArrayList of string values]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the char to be escaped + @return an escaped string]]> + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the escaped char + @return an unescaped string]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tool, is the standard for any Map-Reduce tool/application. + The tool/application should delegate the handling of + + standard command-line options to {@link ToolRunner#run(Tool, String[])} + and only handle its custom arguments.

+ +

Here is how a typical Tool is implemented:

+

+     public class MyApp extends Configured implements Tool {
+     
+       public int run(String[] args) throws Exception {
+         // Configuration processed by ToolRunner
+         Configuration conf = getConf();
+         
+         // Create a JobConf using the processed conf
+         JobConf job = new JobConf(conf, MyApp.class);
+         
+         // Process custom command-line options
+         Path in = new Path(args[1]);
+         Path out = new Path(args[2]);
+         
+         // Specify various job-specific parameters     
+         job.setJobName("my-app");
+         job.setInputPath(in);
+         job.setOutputPath(out);
+         job.setMapperClass(MyApp.MyMapper.class);
+         job.setReducerClass(MyApp.MyReducer.class);
+
+         // Submit the job, then poll for progress until the job is complete
+         JobClient.runJob(job);
+       }
+       
+       public static void main(String[] args) throws Exception {
+         // Let ToolRunner handle generic command-line options 
+         int res = ToolRunner.run(new Configuration(), new Sort(), args);
+         
+         System.exit(res);
+       }
+     }
+ 

+ + @see GenericOptionsParser + @see ToolRunner]]> +
+
+ + + + + + + + + + + + Tool by {@link Tool#run(String[])}, after + parsing with the given generic arguments. Uses the given + Configuration, or builds one if null. + + Sets the Tool's configuration with the possibly modified + version of the conf. + + @param conf Configuration for the Tool. + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + Tool with its Configuration. + + Equivalent to run(tool.getConf(), tool, args). + + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + + + ToolRunner can be used to run classes implementing + Tool interface. It works in conjunction with + {@link GenericOptionsParser} to parse the + + generic hadoop command line arguments and modifies the + Configuration of the Tool. The + application-specific options are passed along without being modified. +

+ + @see Tool + @see GenericOptionsParser]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
diff --git a/lib/jdiff/hadoop_0.19.1.xml b/lib/jdiff/hadoop_0.19.1.xml new file mode 100644 index 00000000000..92bdd2c7996 --- /dev/null +++ b/lib/jdiff/hadoop_0.19.1.xml @@ -0,0 +1,44195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + final. + + @param name resource to be added, the classpath is examined for a file + with that name.]]> + + + + + + final. + + @param url url of the resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param file file-path of resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param in InputStream to deserialize the object from.]]> + + + + + + + + + + + name property, null if + no such property exists. + + Values are processed for variable expansion + before being returned. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + name property, without doing + variable expansion. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + + value of the name property. + + @param name property name. + @param value property value.]]> + + + + + + + name property. If no such property + exists, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value, or defaultValue if the property + doesn't exist.]]> + + + + + + + name property as an int. + + If no such property exists, or if the specified value is not a valid + int, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as an int, + or defaultValue.]]> + + + + + + + name property to an int. + + @param name property name. + @param value int value of the property.]]> + + + + + + + name property as a long. + If no such property is specified, or if the specified value is not a valid + long, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a long, + or defaultValue.]]> + + + + + + + name property to a long. + + @param name property name. + @param value long value of the property.]]> + + + + + + + name property as a float. + If no such property is specified, or if the specified value is not a valid + float, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a float, + or defaultValue.]]> + + + + + + + name property as a boolean. + If no such property is specified, or if the specified value is not a valid + boolean, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a boolean, + or defaultValue.]]> + + + + + + + name property to a boolean. + + @param name property name. + @param value boolean value of the property.]]> + + + + + + + + + + + + + name property as + a collection of Strings. + If no such property is specified then empty collection is returned. +

+ This is an optimized version of {@link #getStrings(String)} + + @param name property name. + @return property value as a collection of Strings.]]> + + + + + + name property as + an array of Strings. + If no such property is specified then null is returned. + + @param name property name. + @return property value as an array of Strings, + or null.]]> + + + + + + + name property as + an array of Strings. + If no such property is specified then default value is returned. + + @param name property name. + @param defaultValue The default value + @return property value as an array of Strings, + or default value.]]> + + + + + + + name property as + as comma delimited values. + + @param name property name. + @param values The values]]> + + + + + + + + + + + + + + name property + as an array of Class. + The value of the property specifies a list of comma separated class names. + If no such property is specified, then defaultValue is + returned. + + @param name the property name. + @param defaultValue default value. + @return property value as a Class[], + or defaultValue.]]> + + + + + + + name property as a Class. + If no such property is specified, then defaultValue is + returned. + + @param name the class name. + @param defaultValue default value. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property as a Class + implementing the interface specified by xface. + + If no such property is specified, then defaultValue is + returned. + + An exception is thrown if the returned class does not implement the named + interface. + + @param name the class name. + @param defaultValue default value. + @param xface the interface implemented by the named class. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property to the name of a + theClass implementing the given interface xface. + + An exception is thrown if theClass does not implement the + interface xface. + + @param name property name. + @param theClass property value. + @param xface the interface implemented by the named class.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + + + + + name. + + @param name configuration resource name. + @return an input stream attached to the resource.]]> + + + + + + name. + + @param name configuration resource name. + @return a reader attached to the resource.]]> + + + + + + + + + + + + + + + String + key-value pairs in the configuration. + + @return an iterator over the entries.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + true to set quiet-mode on, false + to turn it off.]]> + + + + + + + + + + + + + + + + + + + Resources + +

Configurations are specified by resources. A resource contains a set of + name/value pairs as XML data. Each resource is named by either a + String or by a {@link Path}. If named by a String, + then the classpath is examined for a file with that name. If named by a + Path, then the local filesystem is examined directly, without + referring to the classpath. + +

Unless explicitly turned off, Hadoop by default specifies two + resources, loaded in-order from the classpath:

    +
  1. hadoop-default.xml: Read-only defaults for hadoop.
  2. hadoop-site.xml: Site-specific configuration for a given hadoop + installation.
+ Applications may add additional resources, which are loaded + subsequent to these resources in the order they are added. + +

Final Parameters

+ +

Configuration parameters may be declared final. + Once a resource declares a value final, no subsequently-loaded + resource can alter that value. + For example, one might define a final parameter with: +

+  <property>
+    <name>dfs.client.buffer.dir</name>
+    <value>/tmp/hadoop/dfs/client</value>
+    <final>true</final>
+  </property>
+ + Administrators typically define parameters as final in + hadoop-site.xml for values that user applications may not alter. + +

Variable Expansion

+ +

Value strings are first processed for variable expansion. The + available properties are:

    +
  1. Other properties defined in this Configuration; and, if a name is + undefined here,
  2. Properties in {@link System#getProperties()}.
+ +

For example, if a configuration resource contains the following property + definitions: +

+  <property>
+    <name>basedir</name>
+    <value>/user/${user.name}</value>
+  </property>
+  
+  <property>
+    <name>tempdir</name>
+    <value>${basedir}/tmp</value>
+  </property>
+ + When conf.get("tempdir") is called, then ${basedir} + will be resolved to another property in this Configuration, while + ${user.name} would then ordinarily be resolved to the value + of the System property with that name.]]> +
+
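A short sketch of the get/set and variable-expansion behaviour described above; the property names are illustrative.
<pre>
import org.apache.hadoop.conf.Configuration;

public class ConfDemo {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.set("basedir", "/user/${user.name}");
    conf.set("tempdir", "${basedir}/tmp");
    // ${basedir} resolves against this Configuration, ${user.name} against
    // the System properties, as described above.
    System.out.println(conf.get("tempdir"));
    // Reads fall back to the supplied default when the property is absent.
    System.out.println(conf.getInt("io.sort.mb", 100));
  }
}
</pre>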
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DistributedCache is a facility provided by the Map-Reduce + framework to cache files (text, archives, jars etc.) needed by applications. +

+ +

Applications specify the files, via urls (hdfs:// or http://) to be cached + via the {@link org.apache.hadoop.mapred.JobConf}. + The DistributedCache assumes that the + files specified via hdfs:// urls are already present on the + {@link FileSystem} at the path specified by the url.

+ +

The framework will copy the necessary files on to the slave node before + any tasks for the job are executed on that node. Its efficiency stems from + the fact that the files are only copied once per job and the ability to + cache archives which are un-archived on the slaves.

+ +

DistributedCache can be used to distribute simple, read-only + data/text files and/or more complex types such as archives, jars etc. + Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. + Jars may be optionally added to the classpath of the tasks, a rudimentary + software distribution mechanism. Files have execution permissions. + Optionally users can also direct it to symlink the distributed cache file(s) + into the working directory of the task.

+ +

DistributedCache tracks modification timestamps of the cache + files. Clearly the cache files should not be modified by the application + or externally while the job is executing.

+ +

Here is an illustrative example on how to use the + DistributedCache:

+

+     // Setting up the cache for the application
+     
+     1. Copy the requisite files to the FileSystem:
+     
+     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
+     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
+     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+     
+     2. Setup the application's JobConf:
+     
+     JobConf job = new JobConf();
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
+                                   job);
+     DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+     
+     3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+     or {@link org.apache.hadoop.mapred.Reducer}:
+     
+     public static class MapClass extends MapReduceBase  
+     implements Mapper<K, V, K, V> {
+     
+       private Path[] localArchives;
+       private Path[] localFiles;
+       
+       public void configure(JobConf job) {
+         // Get the cached archives/files
+         localArchives = DistributedCache.getLocalCacheArchives(job);
+         localFiles = DistributedCache.getLocalCacheFiles(job);
+       }
+       
+       public void map(K key, V value, 
+                       OutputCollector<K, V> output, Reporter reporter) 
+       throws IOException {
+         // Use data from the cached archives/files here
+         // ...
+         // ...
+         output.collect(k, v);
+       }
+     }
+     
+ 

+ + @see org.apache.hadoop.mapred.JobConf + @see org.apache.hadoop.mapred.JobClient]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BufferedFSInputStream + with the specified buffer size, + and saves its argument, the input stream + in, for later use. An internal + buffer array of length size + is created and stored in buf. + + @param in the underlying input stream. + @param size the buffer size. + @exception IllegalArgumentException if size <= 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + setReplication of FileSystem + @param src file name + @param replication new replication + @throws IOException + @return true if successful; + false if file does not exist or is a directory]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ' + @deprecated Consider using {@link GenericOptionsParser} instead.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fs.scheme.class whose value names the FileSystem class. + The entire URI is passed to the FileSystem instance's initialize method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Return all the files that match filePattern and are not checksum + files. Results are sorted by their names. + +

+ A filename pattern is composed of regular characters and + special pattern matching characters, which are: + +

+
+
+

+

? +
Matches any single character. + +

+

* +
Matches zero or more characters. + +

+

[abc] +
Matches a single character from character set + {a,b,c}. + +

+

[a-b] +
Matches a single character from the character range + {a...b}. Note that character a must be + lexicographically less than or equal to character b. + +

+

[^a] +
Matches a single character that is not from character set or range + {a}. Note that the ^ character must occur + immediately to the right of the opening bracket. + +

+

\c +
Removes (escapes) any special meaning of character c. + +

+

{ab,cd} +
Matches a string from the string set {ab, cd} + +

+

{ab,c{de,fh}} +
Matches a string from the string set {ab, cde, cfh} + +
+
+
+ + @param pathPattern a regular expression specifying a pth pattern + + @return an array of paths that match the path pattern + @throws IOException]]> +
+
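A hedged sketch of the pattern syntax documented above, assuming globStatus is the glob method being described; the paths are illustrative.
<pre>
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobExample {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // Match part files from any 2008 daily directory (illustrative paths).
    FileStatus[] matches = fs.globStatus(new Path("/logs/2008-*/part-[0-9]*"));
    for (FileStatus status : matches) {
      System.out.println(status.getPath());
    }
  }
}
</pre>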
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All user code that may potentially use the Hadoop Distributed + File System should be written to use a FileSystem object. The + Hadoop DFS is a multi-machine system that appears as a single + disk. It's useful because of its fault tolerance and potentially + very large capacity. + +

+ The local implementation is {@link LocalFileSystem} and distributed + implementation is DistributedFileSystem.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FilterFileSystem contains + some other file system, which it uses as + its basic file system, possibly transforming + the data along the way or providing additional + functionality. The class FilterFileSystem + itself simply overrides all methods of + FileSystem with versions that + pass all requests to the contained file + system. Subclasses of FilterFileSystem + may further override some of these methods + and may also provide additional methods + and fields.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + buf at offset + and checksum into checksum. + The method is used for implementing read, therefore, it should be optimized + for sequential reading + @param pos chunkPos + @param buf desitination buffer + @param offset offset in buf at which to store data + @param len maximun number of bytes to read + @return number of bytes read]]> + + + + + + + + + + + + + + + + + -1 if the end of the + stream is reached. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + This method implements the general contract of the corresponding + {@link InputStream#read(byte[], int, int) read} method of + the {@link InputStream} class. As an additional + convenience, it attempts to read as many bytes as possible by repeatedly + invoking the read method of the underlying stream. This + iterated read continues until one of the following + conditions becomes true:

    + +
  • The specified number of bytes have been read, + +
  • The read method of the underlying stream returns + -1, indicating end-of-file. + +
If the first read on the underlying stream returns + -1 to indicate end-of-file then this method returns + -1. Otherwise this method returns the number of bytes + actually read. + + @param b destination buffer. + @param off offset at which to start storing bytes. + @param len maximum number of bytes to read. + @return the number of bytes read, or -1 if the end of + the stream has been reached. + @exception IOException if an I/O error occurs. + ChecksumException if any checksum error occurs]]> +
+ + + + + + + + + + + + + + + + + + n bytes of data from the + input stream. + +

This method may skip more bytes than are remaining in the backing + file. This produces no exception and the number of bytes skipped + may include some number of bytes that were beyond the EOF of the + backing file. Attempting to read from the stream after skipping past + the end will result in -1 indicating the end of the file. + +

If n is negative, no bytes are skipped. + + @param n the number of bytes to be skipped. + @return the actual number of bytes skipped. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to skip to is corrupted]]> + + + + + + + This method may seek past the end of the file. + This produces no exception and an attempt to read from + the stream will result in -1 indicating the end of the file. + + @param pos the postion to seek to. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to seek to is corrupted]]> + + + + + + + + + + len bytes from + stm + + @param stm an input stream + @param buf destiniation buffer + @param offset offset at which to store data + @param len number of bytes to read + @return actual number of bytes read + @throws IOException if there is any IO error]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + len bytes from the specified byte array + starting at offset off and generate a checksum for + each data chunk. + +

This method stores bytes from the given array into this + stream's buffer before it gets checksumed. The buffer gets checksumed + and flushed to the underlying output stream when all data + in a checksum chunk are in the buffer. If the buffer is empty and + requested length is at least as large as the size of next checksum chunk + size, this method will checksum and write the chunk directly + to the underlying output stream. Thus it avoids uneccessary data copy. + + @param b the data. + @param off the start offset in the data. + @param len the number of bytes to write. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if and only if pathname + should be included]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + trash feature. Files are moved to a user's trash + directory, a subdirectory of their home directory named ".Trash". Files are + initially moved to a current sub-directory of the trash directory. + Within that sub-directory their original path is preserved. Periodically + one may checkpoint the current trash and remove older checkpoints. (This + design permits trash management without enumeration of the full trash + content, without date support in the filesystem, and without clock + synchronization.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} backed by an FTP client provided by Apache Commons Net. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is a tool for migrating data from an older to a newer version + of an S3 filesystem. +

+

+ All files in the filesystem are migrated by re-writing the block metadata + - no datafiles are touched. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + Extracts AWS credentials from the filesystem URI or configuration. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A block-based {@link FileSystem} backed by + Amazon S3. +

+ @see NativeS3FileSystem]]> +
+
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If f is a file, this method will make a single call to S3. + If f is a directory, this method will make a maximum of + (n / 1000) + 2 calls to S3, where n is the total number of + files and directories contained directly in f. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} for reading and writing files stored on + Amazon S3. + Unlike {@link org.apache.hadoop.fs.s3.S3FileSystem} this implementation + stores files on S3 in their + native form so they can be read by other S3 tools. +

+ @see org.apache.hadoop.fs.s3.S3FileSystem]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + @param name The name of the server + @param port The port to use on the server + @param findPort whether the server should start at the given port and + increment by 1 until it finds a free port. + @param conf Configuration]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points to the log directory + "/static/" -> points to common static files (src/webapps/static) + "/" -> the jsp server code from (src/webapps/)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nth value.]]> + + + + + + + + + + + + + + + + + + + + + nth value in the file.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + public class IntArrayWritable extends ArrayWritable { + public IntArrayWritable() { + super(IntWritable.class); + } + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a ByteWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataInputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using DataInput methods ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataOutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using DataOutput methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + the class of the item + @param conf the configuration to store + @param item the object to be stored + @param keyName the name of the key to use + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param items the objects to be stored + @param keyName the name of the key to use + @throws IndexOutOfBoundsException if the items array is empty + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + DefaultStringifier offers convenience methods to store/load objects to/from + the configuration. + + @param the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a DoubleWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a FloatWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When two sequence files, which have same Key type but different Value + types, are mapped out to reduce, multiple Value types is not allowed. + In this case, this class can help you wrap instances with different types. +

+ +

+ Compared with ObjectWritable, this class is much more effective, + because ObjectWritable will append the class declaration as a String + into the output file in every Key-Value pair. +

+ +

+ Generic Writable implements {@link Configurable} interface, so that it will be + configured by the framework. The configuration is passed to the wrapped objects + implementing {@link Configurable} interface before deserialization. +

+ + how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines + the classes which will be wrapped in GenericObject in application. + Attention: this classes defined in getTypes() method, must + implement Writable interface. +

+ + The code looks like this: +
+ public class GenericObject extends GenericWritable {
+ 
+   private static Class[] CLASSES = {
+               ClassType1.class, 
+               ClassType2.class,
+               ClassType3.class,
+               };
+
+   protected Class[] getTypes() {
+       return CLASSES;
+   }
+
+ }
+ 
+ + @since Nov 8, 2006]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new InputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using InputStream methods ...
+ }
+ 
+ @see DataInputBuffer + @see DataOutput]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a IntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + closes the input and output streams + at the end. + @param in InputStrem to read from + @param out OutputStream to write to + @param conf the Configuration object]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ignore any {@link IOException} or + null pointers. Must only be used for cleanup in exception handlers. + @param log the log to record problems to at debug level. Can be null. + @param closeables the objects to close]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a LongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A map is a directory containing two files, the data file, + containing all keys and values in the map, and a smaller index + file, containing a fraction of the keys. The fraction is determined by + {@link Writer#getIndexInterval()}. + +

The index file is read entirely into memory. Thus key implementations + should try to keep themselves small. + +

Map files are created by adding entries in-order. To maintain a large + database, perform updates by copying the previous version of a database and + merging in a sorted change list, to create a new version of the database in + a new file. Sorting large change lists can be done with {@link + SequenceFile.Sorter}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key and + val. Returns true if such a pair exists and false when at + the end of the map]]> + + + + + + + + + + + + + + + + key or if it does not exist, at the first entry + after the named key. + +- * @param key - key that we're trying to find +- * @param val - data value if key is found +- * @return - the key that was the closest match or null if eof.]]> + + + + + + + + + key does not exist, return + the first entry that falls just before the key. Otherwise, + return the record that sorts just after. + @return - the key that was the closest match or null if eof.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is an MD5Hash whose digest contains the + same values.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new OutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ OutputBuffer buffer = new OutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using OutputStream methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
+ @see DataOutputBuffer + @see InputBuffer]]> +
+
+ + + + + + + + + + + + + + + A {@link Comparator} that operates directly on byte representations of + objects. +

+ @param + @see DeserializerComparator]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SequenceFiles are flat files consisting of binary key/value + pairs. + +

SequenceFile provides {@link Writer}, {@link Reader} and + {@link Sorter} classes for writing, reading and sorting respectively.

+ + There are three SequenceFile Writers based on the + {@link CompressionType} used to compress key/value pairs: +
    +
1. Writer : Uncompressed records.
2. RecordCompressWriter : Record-compressed files; only values are compressed.
3. BlockCompressWriter : Block-compressed files; both keys and values are
   collected in 'blocks' separately and compressed. The size of the 'block'
   is configurable.
+ +

The actual compression algorithm used to compress key and/or values can be + specified by using the appropriate {@link CompressionCodec}.

+ +

The recommended way is to use the static createWriter methods + provided by the SequenceFile to chose the preferred format.

+ +

The {@link Reader} acts as the bridge and can read any of the above + SequenceFile formats.

+ +

SequenceFile Formats

+ +

Essentially there are 3 different formats for SequenceFiles + depending on the CompressionType specified. All of them share a + common header described below. + +

+
    +
• version - 3 bytes of magic header SEQ, followed by 1 byte of actual
  version number (e.g. SEQ4 or SEQ6)
• keyClassName - key class
• valueClassName - value class
• compression - A boolean which specifies if compression is turned on for
  keys/values in this file.
• blockCompression - A boolean which specifies if block-compression is
  turned on for keys/values in this file.
• compression codec - CompressionCodec class which is used for
  compression of keys and/or values (if compression is enabled).
• metadata - {@link Metadata} for this file.
• sync - A sync marker to denote end of the header.

Uncompressed SequenceFile Format
• Header
• Record
    • Record length
    • Key length
    • Key
    • Value
• A sync-marker every few 100 bytes or so.

Record-Compressed SequenceFile Format
• Header
• Record
    • Record length
    • Key length
    • Key
    • Compressed Value
• A sync-marker every few 100 bytes or so.

Block-Compressed SequenceFile Format
• Header
• Record Block
    • Compressed key-lengths block-size
    • Compressed key-lengths block
    • Compressed keys block-size
    • Compressed keys block
    • Compressed value-lengths block-size
    • Compressed value-lengths block
    • Compressed values block-size
    • Compressed values block
• A sync-marker every few 100 bytes or so.
+ +

The compressed blocks of key lengths and value lengths consist of the + actual lengths of individual keys/values encoded in ZeroCompressedInteger + format.

+ + @see CompressionCodec]]> +
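+ A hedged usage sketch (not part of the original docs; the path and records
+ are illustrative): writing and reading an uncompressed SequenceFile through
+ the createWriter factory and Reader described above.
+
+     import org.apache.hadoop.conf.Configuration;
+     import org.apache.hadoop.fs.FileSystem;
+     import org.apache.hadoop.fs.Path;
+     import org.apache.hadoop.io.IntWritable;
+     import org.apache.hadoop.io.SequenceFile;
+     import org.apache.hadoop.io.Text;
+
+     public class SequenceFileExample {
+       public static void main(String[] args) throws Exception {
+         Configuration conf = new Configuration();
+         FileSystem fs = FileSystem.get(conf);
+         Path file = new Path("/tmp/example.seq");   // illustrative path
+
+         // Write a few key/value pairs.
+         SequenceFile.Writer writer =
+             SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class);
+         for (int i = 0; i < 3; i++) {
+           writer.append(new IntWritable(i), new Text("value-" + i));
+         }
+         writer.close();
+
+         // Read the pairs back; next() returns false at end of file.
+         SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+         IntWritable key = new IntWritable();
+         Text value = new Text();
+         while (reader.next(key, value)) {
+           System.out.println(key + "\t" + value);
+         }
+         reader.close();
+       }
+     }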
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key, skipping its + value. True if another entry exists, and false at end of file.]]> + + + + + + + + key and + val. Returns true if such a pair exists and false when at + end of file]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The position passed must be a position returned by {@link + SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary + position, use {@link SequenceFile.Reader#sync(long)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SegmentDescriptor + @param segments the list of SegmentDescriptors + @param tmpDir the directory to write temporary files into + @return RawKeyValueIterator + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For best performance, applications should make sure that the {@link + Writable#readFields(DataInput)} implementation of their keys is + very efficient. In particular, it should avoid allocating memory.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This always returns a synchronized position. In other words, + immediately after calling {@link SequenceFile.Reader#seek(long)} with a position + returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However + the key may be earlier in the file than key last written when this + method was called (e.g., with block-compression, it may be the first key + in the block that was being written when this method was called).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key. Returns + true if such a key exists and false when at the end of the set.]]> + + + + + + + key. + Returns key, or null if no match exists.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + position. Note that this + method avoids using the converter or doing String instatiation + @return the Unicode scalar value at position or -1 + if the position is invalid or points to a + trailing byte]]> + + + + + + + + + + what in the backing + buffer, starting as position start. The starting + position is measured in bytes and the return value is in + terms of byte position in the buffer. The backing buffer is + not converted to a string for this operation. 
+ @return byte position of the first occurence of the search + string in the UTF-8 buffer or -1 if not found]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a Text with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException.]]> + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException. + @return ByteBuffer: bytes stores at ByteBuffer.array() + and length is ByteBuffer.limit()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In + addition, it provides methods for string traversal without converting the + byte array to a string.

Also includes utilities for + serializing/deserialing a string, coding/decoding a string, checking if a + byte array contains valid UTF8 code, calculating the length of an encoded + string.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a UTF8 with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + Also includes utilities for efficiently reading and writing UTF-8. + + @deprecated replaced by Text]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This is useful when a class may evolve, so that instances written by the + old version of the class may still be processed by the new version. To + handle this situation, {@link #readFields(DataInput)} + implementations should catch {@link VersionMismatchException}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VIntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VLongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + out. + + @param out DataOuput to serialize this object into. + @throws IOException]]> + + + + + + + in. + +

For efficiency, implementations should attempt to re-use storage in the + existing object where possible.

+ + @param in DataInput to deseriablize this object from. + @throws IOException]]> +
+ + + Any key or value type in the Hadoop Map-Reduce + framework implements this interface.

+ +

Implementations typically implement a static read(DataInput) + method which constructs a new instance, calls {@link #readFields(DataInput)} + and returns the instance.

+ +

Example:

+

+     public class MyWritable implements Writable {
+       // Some data     
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public static MyWritable read(DataInput in) throws IOException {
+         MyWritable w = new MyWritable();
+         w.readFields(in);
+         return w;
+       }
+     }
+ 

]]> +
+ + + + + + + + WritableComparables can be compared to each other, typically + via Comparators. Any type which is to be used as a + key in the Hadoop Map-Reduce framework should implement this + interface.

+ +

Example:

+

+     public class MyWritableComparable implements WritableComparable {
+       // Some data
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public int compareTo(MyWritableComparable w) {
+         int thisValue = this.counter;
+         int thatValue = w.counter;
+         return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+       }
+     }
+ 

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The default implementation reads the data into two {@link + WritableComparable}s (using {@link + Writable#readFields(DataInput)}, then calls {@link + #compare(WritableComparable,WritableComparable)}.]]> + + + + + + + The default implementation uses the natural ordering, calling {@link + Comparable#compareTo(Object)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This base implemenation uses the natural ordering. To define alternate + orderings, override {@link #compare(WritableComparable,WritableComparable)}. + +

One may optimize compare-intensive operations by overriding + {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are + provided to assist in optimized implementations of this method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum type + @param in DataInput to read from + @param enumType Class type of Enum + @return Enum represented by String read from DataInput + @throws IOException]]> + + + + + + + + + + + + + + + + len number of bytes in input streamin + @param in input stream + @param len number of bytes to skip + @throws IOException when skipped less number of bytes]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CompressionCodec for which to get the + Compressor + @return Compressor for the given + CompressionCodec from the pool or a new one]]> + + + + + + CompressionCodec for which to get the + Decompressor + @return Decompressor for the given + CompressionCodec the pool or a new one]]> + + + + + + Compressor to be returned to the pool]]> + + + + + + Decompressor to be returned to the + pool]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Implementations are assumed to be buffered. This permits clients to + reposition the underlying input stream then call {@link #resetState()}, + without having to also synchronize client buffers.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + true if a preset dictionary is needed for decompression. 
+ @return true if a preset dictionary is needed for decompression]]> + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-lzo library is loaded & initialized; + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair. + http://www.oberhumer.com/opensource/lzo/]]> + + + + + + + + + + + + + + + + + + + + + + + lzo compression/decompression pair compatible with lzop. + http://www.lzop.org/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FIXME: This array should be in a private or package private location, + since it could be modified by malicious code. +

]]> +
+ + + + This interface is public for historical purposes. You should have no need to + use it. +

]]> +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Although BZip2 headers are marked with the magic "Bz" this + constructor expects the next byte in the stream to be the first one after + the magic. Thus callers have to skip the first two bytes. Otherwise this + constructor will throw an exception. +

+ + @throws IOException + if the stream content is malformed or an I/O error occurs. + @throws NullPointerException + if in == null]]> +
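+ A hedged sketch of the calling convention described above (assuming the
+ class lives in org.apache.hadoop.io.compress.bzip2; the file name is
+ illustrative): the two magic bytes "BZ" must be consumed before the stream
+ is handed to the constructor.
+
+     import java.io.BufferedInputStream;
+     import java.io.FileInputStream;
+     import java.io.IOException;
+     import java.io.InputStream;
+     import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
+
+     public class Bzip2OpenExample {
+       public static InputStream open(String fileName) throws IOException {
+         // Buffer the source, since CBZip2InputStream reads single bytes.
+         InputStream raw = new BufferedInputStream(new FileInputStream(fileName));
+         // Skip the "BZ" magic; the constructor expects the byte after it.
+         raw.read();
+         raw.read();
+         return new CBZip2InputStream(raw);
+       }
+     }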
+
+ + + + + + + + + + + + + + + The decompression requires large amounts of memory. Thus you should call the + {@link #close() close()} method as soon as possible, to force + CBZip2InputStream to release the allocated memory. See + {@link CBZip2OutputStream CBZip2OutputStream} for information about memory + usage. +

+ +

+ CBZip2InputStream reads bytes from the compressed source stream via + the single byte {@link java.io.InputStream#read() read()} method exclusively. + Thus you should consider using a buffered source stream. +

+ +

+ Instances of this class are not threadsafe. +

]]> +
+
+ + + + + + + + CBZip2OutputStream with a blocksize of 900k. + +

+ Attention: The caller is responsible for writing the two BZip2 magic + bytes "BZ" to the specified stream prior to calling this + constructor. +

+ + @param out + the destination stream. + + @throws IOException + if an I/O error occurs in the specified stream. + @throws NullPointerException + if out == null.]]> +
+
+ + + + CBZip2OutputStream with specified blocksize. + +

+ Attention: The caller is responsible for writing the two BZip2 magic + bytes "BZ" to the specified stream prior to calling this + constructor. +

+ + + @param out + the destination stream. + @param blockSize + the blockSize as 100k units. + + @throws IOException + if an I/O error occurs in the specified stream. + @throws IllegalArgumentException + if (blockSize < 1) || (blockSize > 9). + @throws NullPointerException + if out == null. + + @see #MIN_BLOCKSIZE + @see #MAX_BLOCKSIZE]]> +
+
+ + + + + + + + + + + + + inputLength this method returns MAX_BLOCKSIZE + always. + + @param inputLength + The length of the data which will be compressed by + CBZip2OutputStream.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + == 1.]]> + + + + + == 9.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If you are ever unlucky/improbable enough to get a stack overflow whilst + sorting, increase the following constant and try again. In practice I + have never seen the stack go above 27 elems, so the following limit seems + very generous. +

]]> +
+
+ + + The compression requires large amounts of memory. Thus you should call the + {@link #close() close()} method as soon as possible, to force + CBZip2OutputStream to release the allocated memory. +

+ +

+ You can shrink the amount of allocated memory and maybe raise the compression + speed by choosing a lower blocksize, which in turn may cause a lower + compression ratio. You can avoid unnecessary memory allocation by avoiding + using a blocksize which is bigger than the size of the input. +

+ +

+ You can compute the memory usage for compressing by the following formula: +

+ +
+ <code>400k + (9 * blocksize)</code>.
+ 
+ +

+ To get the memory required for decompression by {@link CBZip2InputStream + CBZip2InputStream} use +

+ +
+ <code>65k + (5 * blocksize)</code>.
+ 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Memory usage by blocksize:

  Blocksize   Compression memory usage   Decompression memory usage
  100k        1300k                      565k
  200k        2200k                      1065k
  300k        3100k                      1565k
  400k        4000k                      2065k
  500k        4900k                      2565k
  600k        5800k                      3065k
  700k        6700k                      3565k
  800k        7600k                      4065k
  900k        8500k                      4565k
+ +

+ For decompression CBZip2InputStream allocates less memory if the + bzipped input is smaller than one block. +

+ +

+ Instances of this class are not threadsafe. +

+ +

+ TODO: Update to BZip2 1.0.1 +

]]> +
+
+ +
+ + + + + + + + + + + + + + + + + true if lzo compressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if lzo decompressors are loaded & initialized, + else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-zlib is loaded & initialized + and can be loaded for this job, else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keep trying a limited number of times, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying for a maximum time, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime mutliplied by the number of tries so far. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime mutliplied by a random + number in the range of [0, 2 to the number of retries) +

]]> +
+
+ + + + + + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + + + A retry policy for RemoteException + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + Try once, and fail by re-throwing the exception. + This corresponds to having no retry mechanism in place. +

]]> +
+
+ + + + Try once, and fail silently for void methods, or by + re-throwing the exception for non-void methods. +

]]> +
+
+ + + + Keep trying forever. +

]]> +
+
+ + + A collection of useful implementations of {@link RetryPolicy}. +

]]> +
+
+ + + + + + + + + + Determines whether the framework should retry a + method for the given exception, and the number + of retries that have been made for that operation + so far. +

+ @param e The exception that caused the method to fail. + @param retries The number of times the method has been retried. + @return true if the method should be retried, + false if the method should not be retried + but shouldn't fail with an exception (only for void methods). + @throws Exception The re-thrown exception e indicating + that the method failed and should not be retried further.]]> +
+
+ + + Specifies a policy for retrying method failures. + Implementations of this interface should be immutable. +

]]> +
+
+ + + + + + + + + + + + Create a proxy for an interface of an implementation class + using the same retry policy for each method in the interface. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param retryPolicy the policy for retirying method call failures + @return the retry proxy]]> +
+
+ + + + + + + Create a proxy for an interface of an implementation class + using the a set of retry policies specified by method name. + If no retry policy is defined for a method then a default of + {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param methodNameToPolicyMap a map of method names to retry policies + @return the retry proxy]]> +
+
+ + + A factory for creating retry proxies. +

]]> +
+
+ +
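+ A hedged illustration of combining these pieces (the Echo interface and the
+ specific policy values are hypothetical; signatures may differ slightly in
+ this release):
+
+     import java.util.concurrent.TimeUnit;
+     import org.apache.hadoop.io.retry.RetryPolicies;
+     import org.apache.hadoop.io.retry.RetryPolicy;
+     import org.apache.hadoop.io.retry.RetryProxy;
+
+     public class RetryExample {
+       // Hypothetical interface/implementation pair, for illustration only.
+       public interface Echo { String echo(String message) throws Exception; }
+
+       public static Echo withRetries(Echo raw) {
+         // Retry up to 4 times, sleeping 10 seconds between attempts.
+         RetryPolicy policy =
+             RetryPolicies.retryUpToMaximumCountWithFixedSleep(4, 10, TimeUnit.SECONDS);
+         return (Echo) RetryProxy.create(Echo.class, raw, policy);
+       }
+     }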
+ + + + + + + + Prepare the deserializer for reading.

]]> +
+
+ + + + + + Deserialize the next object from the underlying input stream. + If the object t is non-null then this deserializer + may set its internal state to the next object read from the input + stream. Otherwise, if the object t is null a new + deserialized object will be created. +

+ @return the deserialized object]]> +
+
+ + + + Close the underlying input stream and clear up any resources.

]]> +
+
+ + + Provides a facility for deserializing objects of type from an + {@link InputStream}. +

+ +

+ Deserializers are stateful, but must not buffer the input since + other producers may read from the input between calls to + {@link #deserialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + A {@link RawComparator} that uses a {@link Deserializer} to deserialize + the objects to be compared so that the standard {@link Comparator} can + be used to compare them. +

+

+ One may optimize compare-intensive operations by using a custom + implementation of {@link RawComparator} that operates directly + on byte representations. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + An experimental {@link Serialization} for Java {@link Serializable} classes. +

+ @see JavaSerializationComparator]]> +
+
+ + + + + + + + + + + + + A {@link RawComparator} that uses a {@link JavaSerialization} + {@link Deserializer} to deserialize objects that are then compared via + their {@link Comparable} interfaces. +

+ @param + @see JavaSerialization]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + Encapsulates a {@link Serializer}/{@link Deserializer} pair. +

+ @param ]]> +
+
+ + + + + + + Serializations are found by reading the io.serializations + property from conf, which is a comma-delimited list of + classnames. +

]]> +
+
+ + + + + + + + + + + + A factory for {@link Serialization}s. +

]]> +
+
+ + + + + + + + Prepare the serializer for writing.

]]> +
+
+ + + + + Serialize t to the underlying output stream.

]]> +
+
+ + + + Close the underlying output stream and clear up any resources.

]]> +
+
+ + + Provides a facility for serializing objects of type to an + {@link OutputStream}. +

+ +

+ Serializers are stateful, but must not buffer the output since + other producers may write to the output between calls to + {@link #serialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + + + +
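+ A hedged round-trip sketch using a Serializer/Deserializer pair obtained
+ from a SerializationFactory, which consults the io.serializations property
+ described above (the value being serialized is illustrative):
+
+     import java.io.ByteArrayInputStream;
+     import java.io.ByteArrayOutputStream;
+     import org.apache.hadoop.conf.Configuration;
+     import org.apache.hadoop.io.Text;
+     import org.apache.hadoop.io.serializer.Deserializer;
+     import org.apache.hadoop.io.serializer.SerializationFactory;
+     import org.apache.hadoop.io.serializer.Serializer;
+
+     public class SerializationRoundTrip {
+       public static void main(String[] args) throws Exception {
+         Configuration conf = new Configuration();
+         SerializationFactory factory = new SerializationFactory(conf);
+
+         // Serialize a Text value into a byte array.
+         Serializer<Text> serializer = factory.getSerializer(Text.class);
+         ByteArrayOutputStream out = new ByteArrayOutputStream();
+         serializer.open(out);
+         serializer.serialize(new Text("hello"));
+         serializer.close();
+
+         // Deserialize it back; passing null asks for a fresh object.
+         Deserializer<Text> deserializer = factory.getDeserializer(Text.class);
+         deserializer.open(new ByteArrayInputStream(out.toByteArray()));
+         Text copy = deserializer.deserialize(null);
+         deserializer.close();
+         System.out.println(copy);
+       }
+     }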
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param, to the IPC server running at + address, returning the value. Throws exceptions if there are + network problems or if the remote code threw an exception.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unwraps any IOException. + + @param lookupTypes the desired exception class. + @return IOException, which is either the lookupClass exception or this.]]> + + + + + This unwraps any Throwable that has a constructor taking + a String as a parameter. + Otherwise it returns this. + + @return Throwable]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + protocol is a Java interface. All parameters and return types must + be one of: + +
  • a primitive type, boolean, byte, + char, short, int, long, + float, double, or void; or
  • + +
  • a {@link String}; or
  • + +
  • a {@link Writable}; or
  • + +
  • an array of the above types
+ + All methods in the protocol should throw only IOException. No field data of + the protocol instance is transmitted.]]> +
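+ A hedged example of a protocol interface that satisfies these constraints
+ (the interface and its methods are hypothetical): only primitives, String,
+ Writable and arrays of these appear, and every method throws IOException.
+
+     import java.io.IOException;
+     import org.apache.hadoop.io.Text;
+
+     public interface KeyValueProtocol {
+       Text get(Text key) throws IOException;
+       String[] listKeys() throws IOException;
+       long size() throws IOException;
+     }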
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + handlerCount determines + the number of handler threads that will be used to process calls.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #rpcQueueTime}.inc(time)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker, + as {@link JobTracker.State} + + @return the current state of the JobTracker.]]> + + + + + + + + + + + + ClusterStatus provides clients with information such as: +

    +
  1. + Size of the cluster. +
  2. +
  3. + Task capacity of the cluster. +
  4. +
  5. + The number of currently running map & reduce tasks. +
  6. +
  7. + State of the JobTracker. +
  8. +

+ +

Clients can query for the latest ClusterStatus, via + {@link JobClient#getClusterStatus()}.

+ + @see JobClient]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter can be of + any {@link Enum} type.

+ +

Counters are bunched into {@link Group}s, each comprising of + counters from a particular Enum class.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of counters, comprising of counters from a particular + counter {@link Enum} class. + +

Group handles localization of the class name and the + counter names.

]]> +
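+ A hedged sketch (the enum and counter names are hypothetical) of an
+ application-defined counter group, incremented from a map or reduce task
+ through the framework-supplied Reporter:
+
+     import org.apache.hadoop.mapred.Reporter;
+
+     public class RecordCounters {
+       // One counter per enum constant; the enum class forms the group.
+       public enum MyCounters { GOOD_RECORDS, BAD_RECORDS }
+
+       public static void count(Reporter reporter, boolean good) {
+         reporter.incrCounter(good ? MyCounters.GOOD_RECORDS
+                                   : MyCounters.BAD_RECORDS, 1);
+       }
+     }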
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param fs the file system that the file is on + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides a generic implementation of + {@link #getSplits(JobConf, int)}. + Subclasses of FileInputFormat can also override the + {@link #isSplitable(FileSystem, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +

Note: The following is valid only if the {@link OutputCommitter} + is {@link FileOutputCommitter}. If OutputCommitter is not + a FileOutputCommitter, the task's temporary output + directory is same as {@link #getOutputPath(JobConf)} i.e. + ${mapred.output.dir}$

+ +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the attemptid, say + attempt_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any + side-files required in ${mapred.work.output.dir} during execution + of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the + framework will move them out similarly - thus she doesn't have to pick + unique paths per task-attempt.

+ +

Note: the value of ${mapred.work.output.dir} during + execution of a particular task-attempt is actually + ${mapred.output.dir}/_temporary/_{$taskid}, and this value is + set by the map-reduce framework. So, just create any side-files in the + path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce + task to take advantage of this feature.

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
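+ A hedged helper sketch (assuming the static getWorkOutputPath(JobConf)
+ helper on FileOutputFormat, as in later releases; the file name and
+ contents are illustrative) for creating a side-file in the task-attempt's
+ work directory:
+
+     import java.io.IOException;
+     import org.apache.hadoop.fs.FSDataOutputStream;
+     import org.apache.hadoop.fs.FileSystem;
+     import org.apache.hadoop.fs.Path;
+     import org.apache.hadoop.mapred.FileOutputFormat;
+     import org.apache.hadoop.mapred.JobConf;
+
+     public class SideFileHelper {
+       // The framework promotes the file to ${mapred.output.dir} only if the
+       // task attempt succeeds, so concurrent attempts never collide.
+       public static void writeSideFile(JobConf conf, String name, String contents)
+           throws IOException {
+         Path workDir = FileOutputFormat.getWorkOutputPath(conf);
+         FSDataOutputStream out = FileSystem.get(conf).create(new Path(workDir, name));
+         out.writeBytes(contents);
+         out.close();
+       }
+     }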
+
+ + + + + + + + + + + + + The generated name can be used to create custom files from within the + different tasks for the job, the names for different tasks will not collide + with each other.

+ +

The given name is postfixed with the task type, 'm' for maps, 'r' for + reduces and the task partition number. For example, given a name 'test' + running on the first map of the job, the generated name will be + 'test-m-00000'.

+ + @param conf the configuration for the job. + @param name the name to make unique. + @return a unique name across all tasks of the job.]]> +
+
+ + + + + The path can be used to create custom files from within the map and + reduce tasks. The path name will be unique for each task. The path parent + will be the job output directory.

+ +

This method uses the {@link #getUniqueName} method to make the file name + unique for the task.
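+ 
+ For illustration, a hedged sketch (the name "myfile" is made up; 'conf' is the job's JobConf):
+ 
+     // e.g. "myfile-m-00000" for the first map task
+     String unique = FileOutputFormat.getUniqueName(conf, "myfile");
+     // A per-task path under the job output directory
+     Path custom = FileOutputFormat.getPathForCustomFile(conf, "myfile");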

+ + @param conf the configuration for the job. + @param name the name for the file. + @return a unique path across all tasks of the job.]]> +
+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For example, a split could + be a <input-file-path, start, offset> tuple. + + @param job job configuration. + @param numSplits the desired number of splits, a hint. + @return an array of {@link InputSplit}s for the job.]]> + + + + + + + + + It is the responsibility of the RecordReader to respect + record boundaries while processing the logical split to present a + record-oriented view to the individual task.

+ + @param split the {@link InputSplit} + @param job the job that this split belongs to + @return a {@link RecordReader}]]> +
+
+ + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
  1. + Validate the input-specification of the job. +
  2. + Split-up the input file(s) into logical {@link InputSplit}s, each of + which is then assigned to an individual {@link Mapper}. +
  3. +
  4. + Provide the {@link RecordReader} implementation to be used to glean + input records from the logical InputSplit for processing by + the {@link Mapper}. +
  5. +
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Clearly, logical splits based on input-size are insufficient for many + applications since record boundaries are to be respected. In such cases, the + application also has to implement a {@link RecordReader}, on which lies the + responsibility to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see JobClient + @see FileInputFormat]]> + + + + + + + + + + InputSplit. + + @return the number of bytes in the input split. + @throws IOException]]> + + + + + + InputSplit is + located as an array of Strings. + @throws IOException]]> + + + + InputSplit represents the data to be processed by an + individual {@link Mapper}. + +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + jobid doesn't correspond to any known job. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient is the primary interface for the user-job to interact + with the {@link JobTracker}. + + JobClient provides facilities to submit jobs, track their + progress, access component-tasks' reports/logs, get the Map-Reduce cluster + status information etc. + +

The job submission process involves: +

    +
  1. + Checking the input and output specifications of the job. +
  2. +
  3. + Computing the {@link InputSplit}s for the job. +
  4. +
  5. + Setup the requisite accounting information for the {@link DistributedCache} + of the job, if necessary. +
  6. +
  7. + Copying the job's jar and configuration to the map-reduce system directory + on the distributed file-system. +
  8. +
  9. + Submitting the job to the JobTracker and optionally monitoring + its status. +
  10. +

+ + Normally the user creates the application, describes various facets of the + job via {@link JobConf} and then uses the JobClient to submit + the job and monitor its progress. + +

Here is an example on how to use JobClient:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     job.setInputPath(new Path("in"));
+     job.setOutputPath(new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+
+     // Submit the job, then poll for progress until the job is complete
+     JobClient.runJob(job);
+ 

+ +

Job Control

+ +

At times clients would chain map-reduce jobs to accomplish complex tasks + which cannot be done via a single map-reduce job. This is fairly easy since + the output of the job, typically, goes to distributed file-system and that + can be used as the input for the next job.

+ +

However, this also means that the onus on ensuring jobs are complete + (success/failure) lies squarely on the clients. In such situations the + various job-control options are: +

    +
  1. + {@link #runJob(JobConf)} : submits the job and returns only after + the job has completed. +
  2. +
  3. + {@link #submitJob(JobConf)} : only submits the job, then poll the + returned handle to the {@link RunningJob} to query status and make + scheduling decisions. +
  4. +
  5. + {@link JobConf#setJobEndNotificationURI(String)} : setup a notification + on job-completion, thus avoiding polling. +
  6. +

+ + @see JobConf + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
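+ 
+ A hedged sketch of the second job-control option above, polling the returned
+ {@link RunningJob} handle (error handling omitted; 'job' is the configured JobConf):
+ 
+     JobClient client = new JobClient(job);
+     RunningJob running = client.submitJob(job);
+     while (!running.isComplete()) {
+       Thread.sleep(5000);   // poll every 5 seconds
+     }
+     if (!running.isSuccessful()) {
+       System.err.println("Job failed: " + running.getTrackingURL());
+     }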
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If the parameter {@code loadDefaults} is false, the new instance + will not load resources from the default files. + + @param loadDefaults specifies whether to load from the default files]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if framework should keep the intermediate files + for failed tasks, false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the outputs of the maps are to be compressed, + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This comparator should be provided if the equivalence rules for keys + for sorting the intermediates are different from those for grouping keys + before each call to + {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

+ +

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + in a single call to the reduce function if K1 and K2 compare as equal.

+ +

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + how keys are sorted, this can be used in conjunction to simulate + secondary sort on values.

+ +

Note: This is not a guarantee of the reduce sort being + stable in any sense. (In any case, with the order of available + map-outputs to the reduce being non-deterministic, it wouldn't make + that much sense.)

+ + @param theClass the comparator class to be used for grouping keys. + It should implement RawComparator. + @see #setOutputKeyComparatorClass(Class)]]> +
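+ 
+ A hedged sketch of the secondary-sort wiring described above (the two comparator
+ class names are hypothetical, not part of this API):
+ 
+     // Sort on the full composite key...
+     job.setOutputKeyComparatorClass(CompositeKeyComparator.class);
+     // ...but group values for a single reduce() call by the primary part only
+     job.setOutputValueGroupingComparator(PrimaryKeyGroupingComparator.class);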
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. Typically the combiner is same as the + the {@link Reducer} for the job i.e. {@link #getReducerClass()}. + + @return the user-defined combiner class used to combine map-outputs.]]> + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. + +

The combiner is a task-level aggregation operation which, in some cases, + helps to cut down the amount of data transferred from the {@link Mapper} to + the {@link Reducer}, leading to better performance.

+ +

Typically the combiner is the same as the Reducer for the + job, i.e. {@link #setReducerClass(Class)}.

+ + @param theClass the user-defined combiner class used to combine + map-outputs.]]> +
+
+ + + true. + + @return true if speculative execution be used for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on, else false.]]> + + + + + true. + + @return true if speculative execution be + used for this job for map tasks, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for map tasks, + else false.]]> + + + + + true. + + @return true if speculative execution be used + for reduce tasks for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for reduce tasks, + else false.]]> + + + + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + Note: This is only a hint to the framework. The actual + number of spawned map tasks depends on the number of {@link InputSplit}s + generated by the job's {@link InputFormat#getSplits(JobConf, int)}. + + A custom {@link InputFormat} is typically used to accurately control + the number of map tasks for the job.

+ +

How many maps?

+ +

The number of maps is usually driven by the total size of the inputs + i.e. total number of blocks of the input files.

+ +

The right level of parallelism for maps seems to be around 10-100 maps + per-node, although it has been set up to 300 or so for very cpu-light map + tasks. Task setup takes a while, so it is best if the maps take at least a + minute to execute.

+ +

The default behavior of file-based {@link InputFormat}s is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of input files. However, the {@link FileSystem} blocksize of the + input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Thus, if you expect 10TB of input data and have a blocksize of 128MB, + you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is + used to set it even higher.

+ + @param n the number of map tasks for this job. + @see InputFormat#getSplits(JobConf, int) + @see FileInputFormat + @see FileSystem#getDefaultBlockSize() + @see FileStatus#getBlockSize()]]> +
+
+ + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + How many reduces? + +

The right number of reduces seems to be 0.95 or + 1.75 multiplied by (<no. of nodes> * + + mapred.tasktracker.reduce.tasks.maximum). +

+ +

With 0.95 all of the reduces can launch immediately and + start transferring map outputs as the maps finish. With 1.75 + the faster nodes will finish their first round of reduces and launch a + second wave of reduces doing a much better job of load balancing.

+ +

Increasing the number of reduces increases the framework overhead, but + increases load balancing and lowers the cost of failures.

+ +

The scaling factors above are slightly less than whole numbers to + reserve a few reduce slots in the framework for speculative-tasks, failures + etc.

+ +

Reducer NONE

+ +

It is legal to set the number of reduce-tasks to zero.

+ +

In this case the output of the map-tasks goes directly to the distributed + file-system, to the path set by + {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the + framework doesn't sort the map-outputs before writing them out to HDFS.

+ + @param n the number of reduce tasks for this job.]]> +
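+ 
+ For example, a hedged sketch of applying the 0.95 factor (the per-node slot count
+ is read from mapred.tasktracker.reduce.tasks.maximum; the default of 2 is illustrative):
+ 
+     ClusterStatus cluster = new JobClient(job).getClusterStatus();
+     int reduceSlotsPerNode = job.getInt("mapred.tasktracker.reduce.tasks.maximum", 2);
+     job.setNumReduceTasks((int) (0.95 * cluster.getTaskTrackers() * reduceSlotsPerNode));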
+
+ + + mapred.map.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per map task.]]> + + + + + + + + + + + mapred.reduce.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per reduce task.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + noFailures, the + tasktracker is blacklisted for this job. + + @param noFailures maximum no. of failures of a given job per tasktracker.]]> + + + + + blacklisted for this job. + + @return the maximum no. of failures of a given job per tasktracker.]]> + + + + + failed. + + Defaults to zero, i.e. any failed map-task results in + the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + failed. + + Defaults to zero, i.e. any failed reduce-task results + in the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The debug script can aid debugging of failed map tasks. The script is + given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setMapDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param mDbgScript the script name]]> +
+
+ + + + + + + + + The debug script can aid debugging of failed reduce tasks. The script + is given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the reduce failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script file needs to be symlinked

+ +

Here is an example on how to submit a script +

+ job.setReduceDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param rDbgScript the script name]]> +
+
+ + + + + + + + null if it hasn't + been set. + @see #setJobEndNotificationURI(String)]]> + + + + + + The uri can contain 2 special parameters: $jobId and + $jobStatus. Those, if present, are replaced by the job's + identifier and completion-status respectively.

+ +

This is typically used by application-writers to implement chaining of + Map-Reduce jobs in an asynchronous manner.

+ + @param uri the job end notification uri + @see JobStatus + @see Job Completion and Chaining]]> +
+
+ + + + When a job starts, a shared directory is created at location + + ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ . + This directory is exposed to the users through + job.local.dir . + So, the tasks can use this space + as scratch space and share files among them.

+ This value is available as System property also. + + @return The localized job specific shared directory]]> +
+
+ + + + + + + + + + + + + + + + + + JobConf is the primary interface for a user to describe a + map-reduce job to the Hadoop framework for execution. The framework tries to + faithfully execute the job as-is described by JobConf, however: +
    +
  1. + Some configuration parameters might have been marked as + + final by administrators and hence cannot be altered. +
  2. +
  3. + While some job parameters are straight-forward to set + (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with the + rest of the framework and/or job-configuration and are relatively more + complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}). +
  4. +

+ +

JobConf typically specifies the {@link Mapper}, combiner + (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and + {@link OutputFormat} implementations to be used etc. + +

Optionally JobConf is used to specify other advanced facets + of the job such as Comparators to be used, files to be put in + the {@link DistributedCache}, whether or not intermediate and/or job outputs + are to be compressed (and how), debuggability via user-provided scripts + ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}) + for doing post-processing on task logs, task's stdout, stderr, syslog, etc.

+ +

Here is an example on how to configure a job via JobConf:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     FileInputFormat.setInputPaths(job, new Path("in"));
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+     
+     job.setInputFormat(SequenceFileInputFormat.class);
+     job.setOutputFormat(SequenceFileOutputFormat.class);
+ 

+ + @see JobClient + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + any job + run on the jobtracker started at 200707121733, we would use : +
 
+ JobID.getTaskIDsPattern("200707121733", null);
+ 
+ which will return : +
 "job_200707121733_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @return a regex pattern matching JobIDs]]> +
+
+ + + An example JobID is : + job_200707121733_0003 , which represents the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse JobID strings, but rather + use appropriate constructors or {@link #forName(String)} method. + + @see TaskID + @see TaskAttemptID + @see JobTracker#getNewJobId() + @see JobTracker#getStartTime()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + "N/A" + + @return Scheduling information associated to particular Job Queue]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -archives + -files inputjar args]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + zero. + + @param conf configuration for the JobTracker. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Output pairs need not be of the same types as input pairs. A given + input pair may map to zero or many output pairs. Output pairs are + collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the input key. + @param value the input value. + @param output collects mapped keys and values. + @param reporter facility to report progress.]]> +
+ + + Maps are the individual tasks which transform input records into a + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link JobConf} for the + job via the {@link JobConfigurable#configure(JobConf)} and initialize + themselves. Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

The framework then calls + {@link #map(Object, Object, OutputCollector, Reporter)} + for each key/value pair in the InputSplit for that task.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the grouping by specifying + a Comparator via + {@link JobConf#setOutputKeyComparatorClass(Class)}.

+ +

The grouped Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

The intermediate, grouped outputs are always stored in + {@link SequenceFile}s. Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the JobConf.

+ +

If the job has + zero + reduces then the output of the Mapper is directly written + to the {@link FileSystem} without grouping by keys.

+ +

Example:

+

+     public class MyMapper<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Mapper<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+       
+       private String mapTaskId;
+       private String inputFile;
+       private int noRecords = 0;
+       
+       public void configure(JobConf job) {
+         mapTaskId = job.get("mapred.task.id");
+         inputFile = job.get("mapred.input.file");
+       }
+       
+       public void map(K key, V val,
+                       OutputCollector<K, V> output, Reporter reporter)
+       throws IOException {
+         // Process the <key, value> pair (assume this takes a while)
+         // ...
+         // ...
+         
+         // Let the framework know that we are alive, and kicking!
+         // reporter.progress();
+         
+         // Process some more
+         // ...
+         // ...
+         
+         // Increment the no. of <key, value> pairs processed
+         ++noRecords;
+
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+        
+         // Every 100 records update application-level status
+         if ((noRecords%100) == 0) {
+           reporter.setStatus(mapTaskId + " processed " + noRecords + 
+                              " from input-file: " + inputFile); 
+         }
+         
+         // Output the result
+         output.collect(key, val);
+       }
+     }
+ 

+ +

Applications may write a custom {@link MapRunnable} to exert greater + control on map processing e.g. multi-threaded Mappers etc.

+ + @see JobConf + @see InputFormat + @see Partitioner + @see Reducer + @see MapReduceBase + @see MapRunnable + @see SequenceFile]]> +
+
+ + + + + + + + + + + + + + + + + + + + + Provides default no-op implementations for a few methods, most non-trivial + applications need to override some of them.

]]> +
+
+ + + + + + + + + + + <key, value> pairs. + +

Mapping of input records to output records is complete when this method + returns.

+ + @param input the {@link RecordReader} to read the input records. + @param output the {@link OutputCollector} to collect the output records. + @param reporter {@link Reporter} to report progress, status-updates etc. + @throws IOException]]> +
+
+ + Custom implementations of MapRunnable can exert greater + control on map processing e.g. multi-threaded, asynchronous mappers etc.

+ + @see Mapper]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly + equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} + to construct RecordReader's for MultiFileSplit's. + @see MultiFileSplit]]> +
+
+ + + + + + + + + + + + + + + + + th Path]]> + + + + + + + + + + + th Path]]> + + + + + + + + + + + + + + + + + + + + + + + MultiFileSplit can be used to implement {@link RecordReader}'s, with + reading one record per file. + @see FileSplit + @see MultiFileInputFormat]]> + + + + + + + + + + + + + + + <key, value> pairs output by {@link Mapper}s + and {@link Reducer}s. + +

OutputCollector is the generalization of the facility + provided by the Map-Reduce framework to collect data output by either the + Mapper or the Reducer i.e. intermediate outputs + or the output of the job.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OutputCommitter describes the commit of task output for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputCommitter of + the job to:

+

    +
  1. + Setup the job during initialization. For example, create the temporary + output directory for the job during the initialization of the job. +
  2. +
  3. + Cleanup the job after the job completion. For example, remove the + temporary output directory after the job completion. +
  4. +
  5. + Setup the task temporary output. +
  6. +
  7. + Check whether a task needs a commit. This is to avoid the commit + procedure if a task does not need commit. +
  8. +
  9. + Commit of the task output. +
  10. +
  11. + Discard the task commit. +
  12. +
+ + @see FileOutputCommitter + @see JobContext + @see TaskAttemptContext]]> +
+
+ + + + + + + + + + + + + + + + + + + This is to validate the output specification for the job when it is + a job is submitted. Typically checks that it does not already exist, + throwing an exception when it already exists, so that output is not + overwritten.

+ + @param ignored + @param job job configuration. + @throws IOException when output should not be attempted]]> +
+
+ + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    +
  1. + Validate the output-specification of the job. For e.g. check that the + output directory doesn't already exist. +
  2. + Provide the {@link RecordWriter} implementation to be used to write out + the output files of the job. Output files are stored in a + {@link FileSystem}. +
  3. +
+ + @see RecordWriter + @see JobConf]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be partitioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.
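+ 
+ A hedged sketch of a custom Partitioner (the key layout - a tab-separated primary field - is
+ purely illustrative):
+ 
+     public class FirstFieldPartitioner implements Partitioner<Text, Text> {
+       public void configure(JobConf job) { }
+       public int getPartition(Text key, Text value, int numPartitions) {
+         String primary = key.toString().split("\t")[0];
+         return (primary.hashCode() & Integer.MAX_VALUE) % numPartitions;
+       }
+     }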

+ + @see Reducer]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0 to 1.0. + @throws IOException]]> + + + + RecordReader reads <key, value> pairs from an + {@link InputSplit}. + +

RecordReader, typically, converts the byte-oriented view of + the input, provided by the InputSplit, and presents a + record-oriented view for the {@link Mapper} & {@link Reducer} tasks for + processing. It thus assumes the responsibility of processing record + boundaries and presenting the tasks with keys and values.

+ + @see InputSplit + @see InputFormat]]> +
+
+ + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param reporter facility to report progress. + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + Reduces values for a given key. + +

The framework calls this method for each + <key, (list of values)> pair in the grouped inputs. + Output values must be of the same type as input values. Input keys must + not be altered. The framework will reuse the key and value objects + that are passed into the reduce, therefore the application should clone + the objects they want to keep a copy of. In many cases, all values are + combined into zero or one value. +

+ +

Output pairs are collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes an insignificant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the key. + @param values the list of values to reduce. + @param output to collect keys and combined values. + @param reporter facility to report progress.]]> +
+ + + The number of Reducers for the job is set by the user via + {@link JobConf#setNumReduceTasks(int)}. Reducer implementations + can access the {@link JobConf} for the job via the + {@link JobConfigurable#configure(JobConf)} method and initialize themselves. + Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

Reducer has 3 primary phases:

+
    +
  1. + +

    Shuffle

    + +

    Reducer is input the grouped output of a {@link Mapper}. + In this phase the framework, for each Reducer, fetches the + relevant partition of the output of all the Mappers, via HTTP.

    +
  2. + +
  3. +

    Sort

    + +

    The framework groups Reducer inputs by keys + (since different Mappers may have output the same key) in this + stage.

    + +

    The shuffle and sort phases occur simultaneously i.e. while outputs are + being fetched they are merged.

    + +
    SecondarySort
    + +

    If equivalence rules for keys while grouping the intermediates are + different from those for grouping keys before reduction, then one may + specify a Comparator via + {@link JobConf#setOutputValueGroupingComparator(Class)}. Since + {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to + control how intermediate keys are grouped, these can be used in conjunction + to simulate secondary sort on values.

    + + + For example, say that you want to find duplicate web pages and tag them + all with the url of the "best" known example. You would set up the job + like: +
      +
    • Map Input Key: url
    • +
    • Map Input Value: document
    • +
    • Map Output Key: document checksum, url pagerank
    • +
    • Map Output Value: url
    • +
    • Partitioner: by checksum
    • +
    • OutputKeyComparator: by checksum and then decreasing pagerank
    • +
    • OutputValueGroupingComparator: by checksum
    • +
    +
  4. + +
  5. +

    Reduce

    + +

    In this phase the + {@link #reduce(Object, Iterator, OutputCollector, Reporter)} + method is called for each <key, (list of values)> pair in + the grouped inputs.

    +

    The output of the reduce task is typically written to the + {@link FileSystem} via + {@link OutputCollector#collect(Object, Object)}.

    +
  6. +
+ +

The output of the Reducer is not re-sorted.

+ +

Example:

+

+     public class MyReducer<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Reducer<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+        
+       private String reduceTaskId;
+       private int noKeys = 0;
+       
+       public void configure(JobConf job) {
+         reduceTaskId = job.get("mapred.task.id");
+       }
+       
+       public void reduce(K key, Iterator<V> values,
+                          OutputCollector<K, V> output, 
+                          Reporter reporter)
+       throws IOException {
+       
+         // Process
+         int noValues = 0;
+         while (values.hasNext()) {
+           V value = values.next();
+           
+           // Increment the no. of values for this key
+           ++noValues;
+           
+           // Process the <key, value> pair (assume this takes a while)
+           // ...
+           // ...
+           
+           // Let the framework know that we are alive, and kicking!
+           if ((noValues%10) == 0) {
+             reporter.progress();
+           }
+         
+           // Process some more
+           // ...
+           // ...
+           
+           // Output the <key, value> 
+           output.collect(key, value);
+         }
+         
+         // Increment the no. of <key, list of values> pairs processed
+         ++noKeys;
+         
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+         
+         // Every 100 keys update application-level status
+         if ((noKeys%100) == 0) {
+           reporter.setStatus(reduceTaskId + " processed " + noKeys);
+         }
+       }
+     }
+ 

+ + @see Mapper + @see Partitioner + @see Reporter + @see MapReduceBase]]> +
+
+ + + + + + + + + + + + + + + Counter of the given group/name.]]> + + + + + + + Enum. + @param amount A non-negative amount by which the counter is to + be incremented.]]> + + + + + + + + + + + + + + InputSplit that the map is reading from. + @throws UnsupportedOperationException if called outside a mapper]]> + + + + + + + + + {@link Mapper} and {@link Reducer} can use the Reporter + provided to report progress or just indicate that they are alive. In + scenarios where the application takes an insignificant amount of time to + process individual key/value pairs, this is crucial since the framework + might assume that the task has timed-out and kill that task. + +

Applications can also update {@link Counters} via the provided + Reporter .

+ + @see Progressable + @see Counters]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + progress of the job's cleanup-tasks, as a float between 0.0 + and 1.0. When all cleanup tasks have completed, the function returns 1.0. + + @return the progress of the job's cleanup-tasks. + @throws IOException]]> + + + + + + progress of the job's setup-tasks, as a float between 0.0 + and 1.0. When all setup tasks have completed, the function returns 1.0. + + @return the progress of the job's setup-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RunningJob is the user-interface to query for details on a + running Map-Reduce job. + +

Clients can get hold of RunningJob via the {@link JobClient} + and then query the running-job for details such as name, configuration, + progress etc.

+ + @see JobClient]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This allows the user to specify the key class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + This allows the user to specify the value class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f. The filtering criteria is + MD5(key) % f == 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + f using + the criteria record# % f == 0. + For example, if the frequency is 10, one out of 10 records is returned.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if auto increment + {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}. + false otherwise.]]> + + + + + + + + + + + + + true if auto increment + {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}. + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop provides an optional mode of execution in which the bad records + are detected and skipped in further attempts. + +

This feature can be used when map/reduce tasks crash deterministically on + certain input. This happens due to bugs in the map/reduce function. The usual + course would be to fix these bugs. But sometimes this is not possible; + perhaps the bug is in third-party libraries for which the source code is + not available. Due to this, the task never reaches completion even with + multiple attempts, and the complete data for that task is lost.

+ +

With this feature, only a small portion of data is lost surrounding + the bad record, which may be acceptable for some user applications; + see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}

+ +

The skipping mode gets kicked off after a certain number of failures; + see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}

+ +

In the skipping mode, the map/reduce task maintains the record range which + is getting processed at all times. Before giving the input to the + map/reduce function, it sends this record range to the Task tracker. + If the task crashes, the Task tracker knows which one was the last reported + range. On further attempts, that range gets skipped.
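+ 
+ A hedged configuration sketch (the numeric values are illustrative only):
+ 
+     // Begin skipping mode after 2 failed attempts over the same range
+     SkipBadRecords.setAttemptsToStartSkipping(conf, 2);
+     // Accept losing at most 1 map input record around each bad record
+     SkipBadRecords.setMapperMaxSkipRecords(conf, 1L);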

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all task attempt IDs + of any jobtracker, in any job, of the first + map task, we would use : +
 
+ TaskAttemptID.getTaskAttemptIDsPattern(null, null, true, 1, null);
+ 
+ which will return : +
 "attempt_[^_]*_[0-9]*_m_000001_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @param attemptId the task attempt number, or null + @return a regex pattern matching TaskAttemptIDs]]> +
+
+ + + An example TaskAttemptID is : + attempt_200707121733_0003_m_000005_0 , which represents the + zeroth task attempt for the fifth map task in the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse TaskAttemptID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the first map task + of any jobtracker, of any job, we would use : +

 
+ TaskID.getTaskIDsPattern(null, null, true, 1);
+ 
+ which will return : +
 "task_[^_]*_[0-9]*_m_000001*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @return a regex pattern matching TaskIDs]]> +
+ + + + An example TaskID is : + task_200707121733_0003_m_000005 , which represents the + fifth map task in the third job running at the jobtracker + started at 200707121733. +

+ Applications should never construct or parse TaskID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.log.dir.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the Job was added.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ([,]*) + func ::= tbl(,"") + class ::= @see java.lang.Class#forName(java.lang.String) + path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) + } + Reads expression from the mapred.join.expr property and + user-supplied join types from mapred.join.define.<ident> + types. Paths supplied to tbl are given as input paths to the + InputFormat class listed. + @see #compose(java.lang.String, java.lang.Class, java.lang.String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ,

) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + mapred.join.define.<ident> to a classname. In the expression + mapred.join.expr, the identifier will be assumed to be a + ComposableRecordReader. + mapred.join.keycomparator can be a classname used to compare keys + in the join. + @see JoinRecordReader + @see MultiFilterRecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + capacity children to position + id in the parent reader. + The id of a root CompositeRecordReader is -1 by convention, but relying + on this is not recommended.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + override(S1,S2,S3) will prefer values + from S3 over S2, and values from S2 over S1 for all keys + emitted from all sources.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [,,...,]]]> + + + + + + + out. + TupleWritable format: + {@code + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Mapper leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Mapper does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Mapper the configuration given for it, + mapperConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running.

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain +

+ + @param job job's JobConf to add the Mapper class. + @param klass the Mapper class to add. + @param inputKeyClass mapper input key class. + @param inputValueClass mapper input value class. + @param outputKeyClass mapper output key class. + @param outputValueClass mapper output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param mapperConf a JobConf with the configuration for the Mapper + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + If this method is overriden super.configure(...) should be + invoked at the beginning of the overwriter method.]]> + + + + + + + + + + map(...) methods of the Mappers in the chain.]]> + + + + + + + If this method is overriden super.close() should be + invoked at the end of the overwriter method.]]> + + + + + The Mapper classes are invoked in a chained (or piped) fashion, the output of + the first becomes the input of the second, and so on until the last Mapper, + the output of the last Mapper will be written to the task's output. +

+ The key functionality of this feature is that the Mappers in the chain do not + need to be aware that they are executed in a chain. This enables having + reusable specialized Mappers that can be combined to perform composite + operations within a single task. +

+ Special care has to be taken when creating chains that the key/values output + by a Mapper are valid for the following Mapper in the chain. It is assumed + all Mappers and the Reduce in the chain use matching output and input key and + value classes as no conversion is done by the chaining code.

+ Using the ChainMapper and the ChainReducer classes it is possible to compose + Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An + immediate benefit of this pattern is a dramatic reduction in disk IO.

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain. +

+ ChainMapper usage pattern: +

+

+ ...
+ conf.setJobName("chain");
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+ 

+ JobConf mapAConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class, + Text.class, Text.class, true, mapAConf); +

+ JobConf mapBConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, mapBConf); +

+ JobConf reduceConf = new JobConf(false); + ... + ChainReducer.setReducer(conf, XReduce.class, LongWritable.class, Text.class, + Text.class, Text.class, true, reduceConf); +

+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, null); +

+ ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class, + LongWritable.class, LongWritable.class, true, null); +

+ FileInputFormat.setInputPaths(conf, inDir); + FileOutputFormat.setOutputPath(conf, outDir); + ... +

+ JobClient jc = new JobClient(conf); + RunningJob job = jc.submitJob(conf); + ... +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Reducer leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Reducer does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Reducer the configuration given for it, + reducerConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running.

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainReducer, this is done by the setReducer or the addMapper for the last + element in the chain. + + @param job job's JobConf to add the Reducer class. + @param klass the Reducer class to add. + @param inputKeyClass reducer input key class. + @param inputValueClass reducer input value class. + @param outputKeyClass reducer output key class. + @param outputValueClass reducer output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param reducerConf a JobConf with the configuration for the Reducer + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Mapper leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Mapper does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Mapper the configuration given for it, + mapperConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running.

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain + . + + @param job chain job's JobConf to add the Mapper class. + @param klass the Mapper class to add. + @param inputKeyClass mapper input key class. + @param inputValueClass mapper input value class. + @param outputKeyClass mapper output key class. + @param outputValueClass mapper output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param mapperConf a JobConf with the configuration for the Mapper + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + If this method is overriden super.configure(...) should be + invoked at the beginning of the overwriter method.]]> + + + + + + + + + + reduce(...) method of the Reducer with the + map(...) methods of the Mappers in the chain.]]> + + + + + + + If this method is overriden super.close() should be + invoked at the end of the overwriter method.]]> + + + + + For each record output by the Reducer, the Mapper classes are invoked in a + chained (or piped) fashion, the output of the first becomes the input of the + second, and so on until the last Mapper, the output of the last Mapper will + be written to the task's output. +

+ The key functionality of this feature is that the Mappers in the chain do not + need to be aware that they are executed after the Reducer or in a chain. + This enables having reusable specialized Mappers that can be combined to + perform composite operations within a single task. +

+ Special care has to be taken when creating chains that the key/values output + by a Mapper are valid for the following Mapper in the chain. It is assumed + all Mappers and the Reduce in the chain use matching output and input key and + value classes as no conversion is done by the chaining code.

+ Using the ChainMapper and the ChainReducer classes it is possible to compose + Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An + immediate benefit of this pattern is a dramatic reduction in disk IO.

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainReducer, this is done by the setReducer or the addMapper for the last + element in the chain. +

+ ChainReducer usage pattern: +

+

+ ...
+ conf.setJobName("chain");
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+ 

+ JobConf mapAConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class, + Text.class, Text.class, true, mapAConf); +

+ JobConf mapBConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, mapBConf); +

+ JobConf reduceConf = new JobConf(false); + ... + ChainReducer.setReducer(conf, XReduce.class, LongWritable.class, Text.class, + Text.class, Text.class, true, reduceConf); +

+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, null); +

+ ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class, + LongWritable.class, LongWritable.class, true, null); +

+ FileInputFormat.setInputPaths(conf, inDir); + FileOutputFormat.setOutputPath(conf, outDir); + ... +

+ JobClient jc = new JobClient(conf); + RunningJob job = jc.submitJob(conf); + ... +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + @param freq The frequency with which records will be emitted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + This will read every split at the client, which is very expensive. + @param freq Probability with which a key will be chosen. + @param numSamples Total number of samples to obtain from all selected + splits.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + Takes the first numSamples / numSplits records from each split. + @param numSamples Total number of samples to obtain from all selected + splits.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the name output is multi, false + if it is single. If the name output is not defined it returns + false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @param conf job conf to add the named output + @param namedOutput named output name, it has to be a word, letters + and numbers only, cannot be the word 'part' as + that is reserved for the + default output. + @param outputFormatClass OutputFormat class. + @param keyClass key class + @param valueClass value class]]> + + + + + + + + + + + + @param conf job conf to add the named output + @param namedOutput named output name, it has to be a word, letters + and numbers only, cannot be the word 'part' as + that is reserved for the + default output. + @param outputFormatClass OutputFormat class. + @param keyClass key class + @param valueClass value class]]> + + + + + + + + By default these counters are disabled. +

+ MultipleOutputs supports counters; by default they are disabled. + The counters group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi + named outputs the name of the counter is the concatenation of the named + output, an underscore '_' and the multiname. + + @param conf job conf in which to enable or disable the counters. + @param enabled indicates if the counters will be enabled or not.]]> +
+
+ + + + + By default these counters are disabled. +

+ MultipleOutputs supports counters; by default they are disabled. + The counters group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi + named outputs the name of the counter is the concatenation of the named + output, an underscore '_' and the multiname. + + + @param conf job conf to check whether the counters are enabled. + @return TRUE if the counters are enabled, FALSE if they are disabled.]]> +
+
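The two setter/getter methods documented above only record a flag in the job configuration. A hedged sketch of using them while setting up a job; the driver class and output name are illustrative, not taken from this file (the classes involved are org.apache.hadoop.mapred.lib.MultipleOutputs, TextOutputFormat, LongWritable and Text):

    JobConf conf = new JobConf(MyJob.class);              // hypothetical driver class
    MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
        LongWritable.class, Text.class);
    MultipleOutputs.setCountersEnabled(conf, true);       // one counter per named output
    boolean countersOn = MultipleOutputs.getCountersEnabled(conf);  // now returns true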
+ + + + + + + + + + + + + @param namedOutput the named output name + @param reporter the reporter + @return the output collector for the given named output + @throws IOException thrown if output collector could not be created]]> + + + + + + + + + + + @param namedOutput the named output name + @param multiName the multi name part + @param reporter the reporter + @return the output collector for the given named output + @throws IOException thrown if output collector could not be created]]> + + + + + + + If overriden subclasses must invoke super.close() at the + end of their close() + + @throws java.io.IOException thrown if any of the MultipleOutput files + could not be closed properly.]]> + + + + OutputCollector passed to + the map() and reduce() methods of the + Mapper and Reducer implementations. +

+ Each additional output, or named output, may be configured with its own + OutputFormat, with its own key class and with its own value + class. +

+ A named output can be a single file or a multi file. The latter is referred to + as a multi named output. +

+ A multi named output is an unbound set of files all sharing the same + OutputFormat, key class and value class configuration. +

+ When named outputs are used within a Mapper implementation, + key/values written to a named output are not part of the reduce phase; only + key/values written to the job OutputCollector are part of the + reduce phase. +

+ MultipleOutputs supports counters; by default they are disabled. The counters + group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi + named outputs the name of the counter is the concatenation of the named + output, an underscore '_' and the multiname. +

+ Job configuration usage pattern is: +

+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+   SequenceFileOutputFormat.class,
+   LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+ 
+

+ Usage in a Reducer implementation is: +

+
+ public class MOReduce implements
+   Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Map implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured to use this MapRunnable class (using + the JobConf.setMapRunnerClass method). + The number of threads the thread-pool can use is set with the + mapred.map.multithreadedrunner.threads property; its default + value is 10 threads. +
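A hedged sketch of the configuration described above; the driver and mapper class names and the thread count are illustrative (the runner class is org.apache.hadoop.mapred.lib.MultithreadedMapRunner):

    JobConf conf = new JobConf(MyJob.class);                    // hypothetical driver class
    conf.setMapperClass(MyThreadSafeMapper.class);              // map() must be thread-safe
    conf.setMapRunnerClass(MultithreadedMapRunner.class);
    conf.setInt("mapred.map.multithreadedrunner.threads", 20);  // default is 10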

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pairs. Uses + {@link StringTokenizer} to break text into tokens.]]> + + + + + + + + + + + + total.order.partitioner.natural.order is not false, a trie + of the first total.order.partitioner.max.trie.depth(2) + 1 bytes + will be built. Otherwise, keys will be located using a binary search of + the partition keyset using the {@link org.apache.hadoop.io.RawComparator} + defined for this job. The input file must be sorted with the same + comparator and contain {@link + org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.]]> + + + + + + + + + + + + R reduces, there are R-1 + keys in the SequenceFile.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + generateKeyValPairs(Object key, Object value); public void + configure(JobConfjob); } + + The package also provides a base class, ValueAggregatorBaseDescriptor, + implementing the above interface. The user can extend the base class and + implement generateKeyValPairs accordingly. + + The primary work of generateKeyValPairs is to emit one or more key/value + pairs based on the input key/value pair. The key in an output key/value pair + encode two pieces of information: aggregation type and aggregation id. The + value will be aggregated onto the aggregation id according the aggregation + type. + + This class offers a function to generate a map/reduce job using Aggregate + framework. The function takes the following parameters: input directory spec + input format (text or sequence file) output directory a file specifying the + user plugin class]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The job can be configured using the static methods in this class, + {@link DBInputFormat}, and {@link DBOutputFormat}. +

+ Alternatively, the properties can be set in the configuration with proper + values. + + @see DBConfiguration#configureDB(JobConf, String, String, String, String) + @see DBInputFormat#setInput(JobConf, Class, String, String) + @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...) + @see DBOutputFormat#setOutput(JobConf, String, String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 20070101 AND length > 0)' + @param orderBy the fieldNames in the orderBy clause. + @param fieldNames The field names in the table + @see #setInput(JobConf, Class, String, String)]]> + + + + + + + + + + + + + + DBInputFormat emits LongWritables containing the record number as + key and DBWritables as value. + + The SQL query, and input class can be using one of the two + setInput methods.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {@link DBOutputFormat} accepts <key,value> pairs, where + key has a type extending DBWritable. Returned {@link RecordWriter} + writes only the key to the database with a batch SQL query.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DBWritable. DBWritable, is similar to {@link Writable} + except that the {@link #write(PreparedStatement)} method takes a + {@link PreparedStatement}, and {@link #readFields(ResultSet)} + takes a {@link ResultSet}. +

+ Implementations are responsible for writing the fields of the object + to PreparedStatement, and reading the fields of the object from the + ResultSet. + +

Example:

+ If we have the following table in the database : +
+ CREATE TABLE MyTable (
+   counter        INTEGER NOT NULL,
+   timestamp      BIGINT  NOT NULL,
+ );
+ 
+ then we can read/write the tuples from/to the table with : +

+ public class MyWritable implements Writable, DBWritable {
+   // Some data     
+   private int counter;
+   private long timestamp;
+       
+   //Writable#write() implementation
+   public void write(DataOutput out) throws IOException {
+     out.writeInt(counter);
+     out.writeLong(timestamp);
+   }
+       
+   //Writable#readFields() implementation
+   public void readFields(DataInput in) throws IOException {
+     counter = in.readInt();
+     timestamp = in.readLong();
+   }
+       
+   public void write(PreparedStatement statement) throws SQLException {
+     statement.setInt(1, counter);
+     statement.setLong(2, timestamp);
+   }
+       
+   public void readFields(ResultSet resultSet) throws SQLException {
+     counter = resultSet.getInt(1);
+     timestamp = resultSet.getLong(2);
+   } 
+ }
+ 

]]> +
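As a complement to the MyWritable example above, a hedged sketch of the job-side setup using DBConfiguration and DBInputFormat; the driver class, JDBC driver, connection URL and credentials are illustrative only:

    JobConf job = new JobConf(MyDBJob.class);                   // hypothetical driver class
    DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/mydb", "user", "password");
    // Read MyTable into MyWritable records, ordered by counter
    DBInputFormat.setInput(job, MyWritable.class, "MyTable",
        null /* conditions */, "counter" /* orderBy */, "counter", "timestamp");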
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When constructing the instance, if the factory property + contextName.class exists, + its value is taken to be the name of the class to instantiate. Otherwise, + the default is to create an instance of + org.apache.hadoop.metrics.spi.NullContext, which is a + dummy "no-op" context which will cause all metric data to be discarded. + + @param contextName the name of the context + @return the named MetricsContext]]> + + + + + + + + + + + + + + When the instance is constructed, this method checks if the file + hadoop-metrics.properties exists on the class path. If it + exists, it must be in the format defined by java.util.Properties, and all + the properties in the file are set as attributes on the newly created + ContextFactory instance. + + @return the singleton ContextFactory instance]]> + + + + getFactory() method.]]> + + + + + + + + + + + + + + + + + + + startMonitoring() again after calling + this. + @see #close()]]> + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A record name identifies the kind of data to be reported. For example, a + program reporting statistics relating to the disks on a computer might use + a record name "diskStats".

+ + A record has zero or more tags. A tag has a name and a value. To + continue the example, the "diskStats" record might use a tag named + "diskName" to identify a particular disk. Sometimes it is useful to have + more than one tag, so there might also be a "diskType" with value "ide" or + "scsi" or whatever.

+ + A record also has zero or more metrics. These are the named + values that are to be reported to the metrics system. In the "diskStats" + example, possible metric names would be "diskPercentFull", "diskPercentBusy", + "kbReadPerSecond", etc.

+ + The general procedure for using a MetricsRecord is to fill in its tag and + metric values, and then call update() to pass the record to the + client library. + Metric data is not immediately sent to the metrics system + each time that update() is called. + An internal table is maintained, identified by the record name. This + table has columns + corresponding to the tag and the metric names, and rows + corresponding to each unique set of tag values. An update + either modifies an existing row in the table, or adds a new row with a set of + tag values that are different from all the other rows. Note that if there + are no tags, then there can be at most one row in the table.
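A minimal sketch of that procedure for the diskStats example; the context name, tag and metric values are illustrative (the helper methods come from org.apache.hadoop.metrics.MetricsUtil):

    MetricsContext context = MetricsUtil.getContext("myContext");   // illustrative context name
    MetricsRecord diskStats = MetricsUtil.createRecord(context, "diskStats");
    diskStats.setTag("diskName", "/dev/sda1");      // selects (or creates) the row
    diskStats.setMetric("diskPercentFull", 72);     // fill in the metric values
    diskStats.setMetric("kbReadPerSecond", 1024);
    diskStats.update();                             // pass the row to the metrics system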

+ + Once a row is added to the table, its data will be sent to the metrics system + on every timer period, whether or not it has been updated since the previous + timer period. If this is inappropriate, for example if metrics were being + reported by some transient object in an application, the remove() + method can be used to remove the row and thus stop the data from being + sent.

+ + Note that the update() method is atomic. This means that it is + safe for different threads to be updating the same metric. More precisely, + it is OK for different threads to call update() on MetricsRecord instances + with the same set of tag names and tag values. Different threads should + not use the same MetricsRecord instance at the same time.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MetricsContext.registerUpdater().]]> + + + + + + + + + + + + + + + + + + + + + + + + + fileName attribute, + if specified. Otherwise the data will be written to standard + output.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is configured by setting ContextFactory attributes which in turn + are usually configured through a properties file. All the attributes are + prefixed by the contextName. For example, the properties file might contain: +

+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contextName.tableName. The returned map consists of + those attributes with the contextName and tableName stripped off.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class implements the internal table of metric data, and the timer + on which data is to be sent to the metrics system. Subclasses must + override the abstract emitRecord method in order to transmit + the data.

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + update + and remove().]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hostname or hostname:port. If + the specs string is null, defaults to localhost:defaultPort. + + @return a list of InetSocketAddress objects.]]> + + + + + + + + + + + + + + + + + + + ,name=" + Where the and are the supplied parameters + + @param serviceName + @param nameName + @param theMbean - the MBean to register + @return the named used to register the MBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.rpc.socket.factory.class.<ClassName>. When no + such parameter exists then fall back on the default socket factory as + configured by hadoop.rpc.socket.factory.class.default. If + this default socket factory is not configured, then fall back on the JVM + default socket factory. + + @param conf the configuration + @param clazz the class (usually a {@link VersionedProtocol}) + @return a socket factory]]> + + + + + + hadoop.rpc.socket.factory.default + + @param conf the configuration + @return the default socket factory as specified in the configuration or + the JVM default socket factory if the configuration does not + contain a default socket factory property.]]> + + + + + + + + + + + + + : + ://:/]]> + + + + + + + + : + ://:/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketInputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getInputStream()} is returned. In the latter + case, the timeout argument is ignored and the timeout set with + {@link Socket#setSoTimeout(int)} applies for reads.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see #getInputStream(Socket, long) + + @param socket + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + +
+ + From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketOutputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getOutputStream()} is returned. In the latter + case, the timeout argument is ignored and the write will wait until + data is available.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see #getOutputStream(Socket, long) + + @param socket + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
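Taken together, the stream helpers documented above are typically applied right after a socket is connected; a hedged sketch with illustrative host, port and timeout values:

    Socket socket = new Socket();                    // or socketFactory.createSocket()
    socket.connect(new InetSocketAddress("server.example.com", 50010), 10000);
    InputStream in = NetUtils.getInputStream(socket, 60000);     // read timeout in ms
    OutputStream out = NetUtils.getOutputStream(socket, 60000);  // write timeout in ms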
+ + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + node + + @param node + a node + @return true if node is already in the tree; false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scope + if scope starts with ~, choose one from the all nodes except for the + ones in scope; otherwise, choose one from scope + @param scope range of nodes from which a node will be choosen + @return the choosen node]]> + + + + + + + scope but not in excludedNodes + if scope starts with ~, return the number of nodes that are not + in scope and excludedNodes; + @param scope a path string that may start with ~ + @param excludedNodes a list of nodes + @return number of available nodes]]> + + + + + + + + + + + + reader + It linearly scans the array, if a local node is found, swap it with + the first element of the array. + If a local rack node is found, swap it with the first element following + the local node. + If neither local node or local rack node is found, put a random replica + location at postion 0. + It leaves the rest nodes untouched.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
+
+ + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + +
+ + Create a new ouput stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout timeout in milliseconds. must not be negative. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = getCount(). + @param newCapacity The new capacity in bytes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index idx = startVector(...); + while (!idx.done()) { + .... // read element of a vector + idx.incr(); + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This task takes the given record definition files and compiles them into + java or c++ + files. It is then up to the user to compile the generated files. + +

The task requires the file or the nested fileset element to be + specified. Optional attributes are language (set the output + language, default is "java"), + destdir (name of the destination directory for generated java/c++ + code, default is ".") and failonerror (specifies error handling + behavior. default is true). +

Usage

+
+ <recordcc
+       destdir="${basedir}/gensrc"
+       language="java">
+   <fileset include="**\/*.jr" />
+ </recordcc>
+ 
]]> +
+
+ +
+ + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi as a comma separated string in + conf as a property attr + + The String starts with the user name followed by the default group names, + and other group names. + + @param conf configuration + @param attr property name + @param ugi a UnixUserGroupInformation]]> + + + + + + + + conf + + The object is expected to store with the property name attr + as a comma separated string that starts + with the user name followed by group names. + If the property name is not defined, return null. + It's assumed that there is only one UGI per user. If this user already + has a UGI in the ugi map, return the ugi in the map. + Otherwise, construct a UGI from the configuration, store it in the + ugi map and return it. + + @param conf configuration + @param attr property name + @return a UnixUGI + @throws LoginException if the stored string is ill-formatted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool supports archiving and anaylzing (sort/grep) of log-files. + It takes as input + a) Input uri which will serve uris of the logs to be archived. + b) Output directory (not mandatory). + b) Directory on dfs to archive the logs. + c) The sort/grep patterns for analyzing the files and separator for boundaries. + Usage: + Logalyzer -archive -archiveDir -analysis -logs -grep -sort -separator +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + in]]> + + + + + + + out.]]> + + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GenericOptionsParser to parse only the generic Hadoop + arguments. + + The array of string arguments other than the generic arguments can be + obtained by {@link #getRemainingArgs()}. + + @param conf the Configuration to modify. + @param args command-line arguments.]]> + + + + + GenericOptionsParser to parse given options as well + as generic Hadoop options. + + The resulting CommandLine object can be obtained by + {@link #getCommandLine()}. + + @param conf the configuration to modify + @param options options built by the caller + @param args User-specified arguments]]> + + + + + Strings containing the un-parsed arguments + or empty array if commandLine was not defined.]]> + + + + + CommandLine object + to process the parsed arguments. + + Note: If the object is created with + {@link #GenericOptionsParser(Configuration, String[])}, then returned + object will only contain parsed generic options. + + @return CommandLine representing list of arguments + parsed against Options descriptor.]]> + + + + + + + + + + + + + + + + + GenericOptionsParser is a utility to parse command line + arguments generic to the Hadoop framework. + + GenericOptionsParser recognizes several standarad command + line arguments, enabling applications to easily specify a namenode, a + jobtracker, additional configuration resources etc. + +

Generic Options

+ +

The supported generic options are:

+

+     -conf <configuration file>     specify a configuration file
+     -D <property=value>            use value for given property
+     -fs <local|namenode:port>      specify a namenode
+     -jt <local|jobtracker:port>    specify a job tracker
+     -files <comma separated list of files>    specify comma separated
+                            files to be copied to the map reduce cluster
+     -libjars <comma separated list of jars>   specify comma separated
+                            jar files to include in the classpath.
+     -archives <comma separated list of archives>    specify comma
+             separated archives to be unarchived on the compute machines.
+
+ 

+ +

The general command line syntax is:

+

+ bin/hadoop command [genericOptions] [commandOptions]
+ 

+ +

Generic command line arguments might modify + Configuration objects, given to constructors.

+ +

The functionality is implemented using Commons CLI.

+ +

Examples:

+

+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+ 
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+     
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+     
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+ 
+ $ bin/hadoop jar -libjars testlib.jar 
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+ 

+ + @see Tool + @see ToolRunner]]> +
+
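For programs that do not go through ToolRunner, the parser can also be driven directly; a hedged sketch, with the surrounding main method shown only for illustration:

    public static void main(String[] args) throws Exception {
      Configuration conf = new Configuration();
      // Applies -D, -fs, -jt, -files, -libjars and -archives options to conf
      GenericOptionsParser parser = new GenericOptionsParser(conf, args);
      String[] remaining = parser.getRemainingArgs();   // application-specific arguments
      // ... use conf and remaining ...
    }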
+ + + + + + + + + Class<T>) of the + argument of type T. + @param The type of the argument + @param t the object to get it class + @return Class<T>]]> + + + + + + + List<T> to a an array of + T[]. + @param c the Class object of the items in the list + @param list the list to convert]]> + + + + + + List<T> to a an array of + T[]. + @param list the list to convert + @throws ArrayIndexOutOfBoundsException if the list is empty. + Use {@link #toArray(Class, List)} if the list may be empty.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + io.file.buffer.size specified in the given + Configuration. + @param in input stream + @param conf configuration + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-hadoop is loaded, + else false]]> + + + + + + true if native hadoop libraries, if present, can be + used for this job; false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { pq.top().change(); pq.adjustTop(); } + instead of
+  { o = pq.pop(); o.change(); pq.push(o); }
+ 
]]> +
+
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clients and/or applications can use the provided Progressable + to explicitly report progress to the Hadoop framework. This is especially + important for operations which take an insignificant amount of time since, + in-lieu of the reported progress, the framework has to assume that an error + has occured and time-out the operation.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Class is to be obtained + @return the correctly typed Class of the given object.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop Pipes + or Hadoop Streaming. + + It also checks to ensure that we are running on a *nix platform else + (e.g. in Cygwin/Windows) it returns null. + @param conf configuration + @return a String[] with the ulimit command arguments or + null if we are running on a non *nix platform or + if the limit is unspecified.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell interface. + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell can be used to run unix commands like du or + df. It also offers facilities to gate commands by + time-intervals.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ShellCommandExecutorshould be used in cases where the output + of the command needs no explicit parsing and where the command, working + directory and the environment remains unchanged. The output of the command + is stored as-is and is expected to be small.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ArrayList of string values]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the char to be escaped + @return an escaped string]]> + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the escaped char + @return an unescaped string]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tool, is the standard for any Map-Reduce tool/application. + The tool/application should delegate the handling of + + standard command-line options to {@link ToolRunner#run(Tool, String[])} + and only handle its custom arguments.

+ +

Here is how a typical Tool is implemented:

+

+     public class MyApp extends Configured implements Tool {
+     
+       public int run(String[] args) throws Exception {
+         // Configuration processed by ToolRunner
+         Configuration conf = getConf();
+         
+         // Create a JobConf using the processed conf
+         JobConf job = new JobConf(conf, MyApp.class);
+         
+         // Process custom command-line options
+         Path in = new Path(args[1]);
+         Path out = new Path(args[2]);
+         
+         // Specify various job-specific parameters     
+         job.setJobName("my-app");
+         job.setInputPath(in);
+         job.setOutputPath(out);
+         job.setMapperClass(MyApp.MyMapper.class);
+         job.setReducerClass(MyApp.MyReducer.class);
+
+         // Submit the job, then poll for progress until the job is complete
+         JobClient.runJob(job);
+       }
+       
+       public static void main(String[] args) throws Exception {
+         // Let ToolRunner handle generic command-line options 
+         int res = ToolRunner.run(new Configuration(), new Sort(), args);
+         
+         System.exit(res);
+       }
+     }
+ 

+ + @see GenericOptionsParser + @see ToolRunner]]> +
+
+ + + + + + + + + + + + Tool by {@link Tool#run(String[])}, after + parsing with the given generic arguments. Uses the given + Configuration, or builds one if null. + + Sets the Tool's configuration with the possibly modified + version of the conf. + + @param conf Configuration for the Tool. + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + Tool with its Configuration. + + Equivalent to run(tool.getConf(), tool, args). + + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + + + ToolRunner can be used to run classes implementing + Tool interface. It works in conjunction with + {@link GenericOptionsParser} to parse the + + generic hadoop command line arguments and modifies the + Configuration of the Tool. The + application-specific options are passed along without being modified. +

+ + @see Tool + @see GenericOptionsParser]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
diff --git a/lib/jdiff/hadoop_0.20.0.xml b/lib/jdiff/hadoop_0.20.0.xml new file mode 100644 index 00000000000..ce6f91bfe60 --- /dev/null +++ b/lib/jdiff/hadoop_0.20.0.xml @@ -0,0 +1,52140 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + final. + + @param name resource to be added, the classpath is examined for a file + with that name.]]> + + + + + + final. + + @param url url of the resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param file file-path of resource to be added, the local filesystem is + examined directly to find the resource, without referring to + the classpath.]]> + + + + + + final. + + @param in InputStream to deserialize the object from.]]> + + + + + + + + + + + name property, null if + no such property exists. + + Values are processed for variable expansion + before being returned. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + name property, without doing + variable expansion. + + @param name the property name. + @return the value of the name property, + or null if no such property exists.]]> + + + + + + + value of the name property. + + @param name property name. + @param value property value.]]> + + + + + + + + + + + + + + name property. If no such property + exists, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value, or defaultValue if the property + doesn't exist.]]> + + + + + + + name property as an int. + + If no such property exists, or if the specified value is not a valid + int, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as an int, + or defaultValue.]]> + + + + + + + name property to an int. + + @param name property name. + @param value int value of the property.]]> + + + + + + + name property as a long. + If no such property is specified, or if the specified value is not a valid + long, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a long, + or defaultValue.]]> + + + + + + + name property to a long. + + @param name property name. + @param value long value of the property.]]> + + + + + + + name property as a float. + If no such property is specified, or if the specified value is not a valid + float, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a float, + or defaultValue.]]> + + + + + + + name property to a float. + + @param name property name. + @param value property value.]]> + + + + + + + name property as a boolean. + If no such property is specified, or if the specified value is not a valid + boolean, then defaultValue is returned. + + @param name property name. + @param defaultValue default value. + @return property value as a boolean, + or defaultValue.]]> + + + + + + + name property to a boolean. + + @param name property name. + @param value boolean value of the property.]]> + + + + + + + + + + + + + + + + + + + + name property as + a collection of Strings. + If no such property is specified then empty collection is returned. +

+ This is an optimized version of {@link #getStrings(String)} + + @param name property name. + @return property value as a collection of Strings.]]> + + + + + + name property as + an array of Strings. + If no such property is specified then null is returned. + + @param name property name. + @return property value as an array of Strings, + or null.]]> + + + + + + + name property as + an array of Strings. + If no such property is specified then default value is returned. + + @param name property name. + @param defaultValue The default value + @return property value as an array of Strings, + or default value.]]> + + + + + + + name property as + as comma delimited values. + + @param name property name. + @param values The values]]> + + + + + + + + + + + + + + name property + as an array of Class. + The value of the property specifies a list of comma separated class names. + If no such property is specified, then defaultValue is + returned. + + @param name the property name. + @param defaultValue default value. + @return property value as a Class[], + or defaultValue.]]> + + + + + + + name property as a Class. + If no such property is specified, then defaultValue is + returned. + + @param name the class name. + @param defaultValue default value. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property as a Class + implementing the interface specified by xface. + + If no such property is specified, then defaultValue is + returned. + + An exception is thrown if the returned class does not implement the named + interface. + + @param name the class name. + @param defaultValue default value. + @param xface the interface implemented by the named class. + @return property value as a Class, + or defaultValue.]]> + + + + + + + + name property to the name of a + theClass implementing the given interface xface. + + An exception is thrown if theClass does not implement the + interface xface. + + @param name property name. + @param theClass property value. + @param xface the interface implemented by the named class.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + dirsProp with + the given path. If dirsProp contains multiple directories, + then one is chosen based on path's hash code. If the selected + directory does not exist, an attempt is made to create it. + + @param dirsProp directory in which to locate the file. + @param path file-path. + @return local file under the directory with the given path.]]> + + + + + + + + + + + + name. + + @param name configuration resource name. + @return an input stream attached to the resource.]]> + + + + + + name. + + @param name configuration resource name. + @return a reader attached to the resource.]]> + + + + + + + + + + + + + + + String + key-value pairs in the configuration. + + @return an iterator over the entries.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + true to set quiet-mode on, false + to turn it off.]]> + + + + + + + + + + + + + + + + + + + Resources + +

Configurations are specified by resources. A resource contains a set of + name/value pairs as XML data. Each resource is named by either a + String or by a {@link Path}. If named by a String, + then the classpath is examined for a file with that name. If named by a + Path, then the local filesystem is examined directly, without + referring to the classpath. + +

Unless explicitly turned off, Hadoop by default specifies two + resources, loaded in-order from the classpath:

    +
  1. core-default.xml + : Read-only defaults for hadoop.
  2. +
  3. core-site.xml: Site-specific configuration for a given hadoop + installation.
  4. +
+ Applications may add additional resources, which are loaded + subsequent to these resources in the order they are added. + +

Final Parameters

+ +

Configuration parameters may be declared final. + Once a resource declares a value final, no subsequently-loaded + resource can alter that value. + For example, one might define a final parameter with: +

+  <property>
+    <name>dfs.client.buffer.dir</name>
+    <value>/tmp/hadoop/dfs/client</value>
+    <final>true</final>
+  </property>
+ + Administrators typically define parameters as final in + core-site.xml for values that user applications may not alter. + +

Variable Expansion

+ +

Value strings are first processed for variable expansion. The + available properties are:

    +
  1. Other properties defined in this Configuration; and, if a name is + undefined here,
  2. +
  3. Properties in {@link System#getProperties()}.
  4. +
+ +

For example, if a configuration resource contains the following property + definitions: +

+  <property>
+    <name>basedir</name>
+    <value>/user/${user.name}</value>
+  </property>
+  
+  <property>
+    <name>tempdir</name>
+    <value>${basedir}/tmp</value>
+  </property>
+ + When conf.get("tempdir") is called, then ${basedir} + will be resolved to another property in this Configuration, while + ${user.name} would then ordinarily be resolved to the value + of the System property with that name.]]> +
+
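In code, the expansion described above is transparent to the caller; a minimal sketch, with an illustrative resource name:

    Configuration conf = new Configuration();
    conf.addResource("my-site.xml");        // illustrative resource on the classpath
    String tempDir = conf.get("tempdir");   // resolves to "/user/" + the user.name property + "/tmp"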
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DistributedCache is a facility provided by the Map-Reduce + framework to cache files (text, archives, jars etc.) needed by applications. +

+ +

Applications specify the files, via urls (hdfs:// or http://) to be cached + via the {@link org.apache.hadoop.mapred.JobConf}. + The DistributedCache assumes that the + files specified via hdfs:// urls are already present on the + {@link FileSystem} at the path specified by the url.

+ +

The framework will copy the necessary files on to the slave node before + any tasks for the job are executed on that node. Its efficiency stems from + the fact that the files are only copied once per job and the ability to + cache archives which are un-archived on the slaves.

+ +

DistributedCache can be used to distribute simple, read-only + data/text files and/or more complex types such as archives, jars etc. + Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes. + Jars may be optionally added to the classpath of the tasks, a rudimentary + software distribution mechanism. Files have execution permissions. + Optionally users can also direct it to symlink the distributed cache file(s) + into the working directory of the task.

+ +

DistributedCache tracks modification timestamps of the cache + files. Clearly the cache files should not be modified by the application + or externally while the job is executing.

+ +

Here is an illustrative example on how to use the + DistributedCache:

+

+     // Setting up the cache for the application
+     
+     1. Copy the requisite files to the FileSystem:
+     
+     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat  
+     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip  
+     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+     
+     2. Setup the application's JobConf:
+     
+     JobConf job = new JobConf();
+     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"), 
+                                   job);
+     DistributedCache.addCacheArchive(new URI("/myapp/map.zip"), job);
+     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz"), job);
+     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz"), job);
+     
+     3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+     or {@link org.apache.hadoop.mapred.Reducer}:
+     
+     public static class MapClass extends MapReduceBase  
+     implements Mapper<K, V, K, V> {
+     
+       private Path[] localArchives;
+       private Path[] localFiles;
+       
+       public void configure(JobConf job) {
+         // Get the cached archives/files
+         localArchives = DistributedCache.getLocalCacheArchives(job);
+         localFiles = DistributedCache.getLocalCacheFiles(job);
+       }
+       
+       public void map(K key, V value, 
+                       OutputCollector<K, V> output, Reporter reporter) 
+       throws IOException {
+         // Use data from the cached archives/files here
+         // ...
+         // ...
+         output.collect(k, v);
+       }
+     }
+     
+ 

+ + @see org.apache.hadoop.mapred.JobConf + @see org.apache.hadoop.mapred.JobClient]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BufferedFSInputStream + with the specified buffer size, + and saves its argument, the input stream + in, for later use. An internal + buffer array of length size + is created and stored in buf. + + @param in the underlying input stream. + @param size the buffer size. + @exception IllegalArgumentException if size <= 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + setReplication of FileSystem + @param src file name + @param replication new replication + @throws IOException + @return true if successful; + false if file does not exist or is a directory]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + fs.scheme.class whose value names the FileSystem class. + The entire URI is passed to the FileSystem instance's initialize method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Return all the files that match filePattern and are not checksum + files. Results are sorted by their names. + +

+ A filename pattern is composed of regular characters and + special pattern matching characters, which are: + +

+
+
+

+

? +
Matches any single character. + +

+

* +
Matches zero or more characters. + +

+

[abc] +
Matches a single character from character set + {a,b,c}. + +

+

[a-b] +
Matches a single character from the character range + {a...b}. Note that character a must be + lexicographically less than or equal to character b. + +

+

[^a] +
Matches a single character that is not from character set or range + {a}. Note that the ^ character must occur + immediately to the right of the opening bracket. + +

+

\c +
Removes (escapes) any special meaning of character c. + +

+

{ab,cd} +
Matches a string from the string set {ab, cd} + +

+

{ab,c{de,fh}} +
Matches a string from the string set {ab, cde, cfh} + +
+
+
+ + @param pathPattern a regular expression specifying a path pattern + + @return an array of paths that match the path pattern + @throws IOException]]> +
+
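+ 
+ A hedged sketch of calling the glob API documented above; the method name
+ (globPaths) follows the javadoc here and may differ in other releases, and the
+ pattern and paths are purely illustrative:
+ 
+     import org.apache.hadoop.conf.Configuration;
+     import org.apache.hadoop.fs.FileSystem;
+     import org.apache.hadoop.fs.Path;
+     
+     public class GlobDemo {
+       public static void main(String[] args) throws Exception {
+         Configuration conf = new Configuration();
+         FileSystem fs = FileSystem.get(conf);
+         // Matches e.g. /logs/2008-05-01/part-00000 ... /logs/2008-05-31/part-00042
+         Path[] matches = fs.globPaths(new Path("/logs/2008-05-*/part-*"));
+         for (Path p : matches) {
+           System.out.println(p);
+         }
+       }
+     }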
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All user code that may potentially use the Hadoop Distributed + File System should be written to use a FileSystem object. The + Hadoop DFS is a multi-machine system that appears as a single + disk. It's useful because of its fault tolerance and potentially + very large capacity. + +

+ The local implementation is {@link LocalFileSystem} and distributed + implementation is DistributedFileSystem.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FilterFileSystem contains + some other file system, which it uses as + its basic file system, possibly transforming + the data along the way or providing additional + functionality. The class FilterFileSystem + itself simply overrides all methods of + FileSystem with versions that + pass all requests to the contained file + system. Subclasses of FilterFileSystem + may further override some of these methods + and may also provide additional methods + and fields.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + buf at offset + and checksum into checksum. + The method is used for implementing read, therefore, it should be optimized + for sequential reading + @param pos chunkPos + @param buf desitination buffer + @param offset offset in buf at which to store data + @param len maximun number of bytes to read + @return number of bytes read]]> + + + + + + + + + + + + + + + + + -1 if the end of the + stream is reached. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + This method implements the general contract of the corresponding + {@link InputStream#read(byte[], int, int) read} method of + the {@link InputStream} class. As an additional + convenience, it attempts to read as many bytes as possible by repeatedly + invoking the read method of the underlying stream. This + iterated read continues until one of the following + conditions becomes true:

    + +
  • The specified number of bytes have been read, + +
  • The read method of the underlying stream returns + -1, indicating end-of-file. + +
If the first read on the underlying stream returns + -1 to indicate end-of-file then this method returns + -1. Otherwise this method returns the number of bytes + actually read. + + @param b destination buffer. + @param off offset at which to start storing bytes. + @param len maximum number of bytes to read. + @return the number of bytes read, or -1 if the end of + the stream has been reached. + @exception IOException if an I/O error occurs. + ChecksumException if any checksum error occurs]]> +
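+ 
+ Because a single call may still return fewer than len bytes, callers that need an
+ exact count loop until the request is satisfied or -1 signals end-of-file. A small
+ sketch of such a loop (not the library's own helper) over any java.io.InputStream:
+ 
+     static int readFully(java.io.InputStream in, byte[] buf, int off, int len)
+         throws java.io.IOException {
+       int total = 0;
+       while (total < len) {
+         int n = in.read(buf, off + total, len - total);
+         if (n == -1) {
+           break;            // end of stream before len bytes were read
+         }
+         total += n;
+       }
+       return total;         // number of bytes actually read
+     }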
+ + + + + + + + + + + + + + + + + + n bytes of data from the + input stream. + +

This method may skip more bytes than are remaining in the backing + file. This produces no exception and the number of bytes skipped + may include some number of bytes that were beyond the EOF of the + backing file. Attempting to read from the stream after skipping past + the end will result in -1 indicating the end of the file. + +

If n is negative, no bytes are skipped. + + @param n the number of bytes to be skipped. + @return the actual number of bytes skipped. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to skip to is corrupted]]> + + + + + + + This method may seek past the end of the file. + This produces no exception and an attempt to read from + the stream will result in -1 indicating the end of the file. + + @param pos the postion to seek to. + @exception IOException if an I/O error occurs. + ChecksumException if the chunk to seek to is corrupted]]> + + + + + + + + + + len bytes from + stm + + @param stm an input stream + @param buf destiniation buffer + @param offset offset at which to store data + @param len number of bytes to read + @return actual number of bytes read + @throws IOException if there is any IO error]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + len bytes from the specified byte array + starting at offset off and generate a checksum for + each data chunk. + +

This method stores bytes from the given array into this + stream's buffer before it gets checksumed. The buffer gets checksumed + and flushed to the underlying output stream when all data + in a checksum chunk are in the buffer. If the buffer is empty and + requested length is at least as large as the size of next checksum chunk + size, this method will checksum and write the chunk directly + to the underlying output stream. Thus it avoids uneccessary data copy. + + @param b the data. + @param off the start offset in the data. + @param len the number of bytes to write. + @exception IOException if an I/O error occurs.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if and only if pathname + should be included]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + trash feature. Files are moved to a user's trash + directory, a subdirectory of their home directory named ".Trash". Files are + initially moved to a current sub-directory of the trash directory. + Within that sub-directory their original path is preserved. Periodically + one may checkpoint the current trash and remove older checkpoints. (This + design permits trash management without enumeration of the full trash + content, without date support in the filesystem, and without clock + synchronization.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} backed by an FTP client provided by Apache Commons Net. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cause==null ? null : cause.toString()) (which + typically contains the class and detail message of cause). + @param cause the cause (which is saved for later retrieval by the + {@link #getCause()} method). (A null value is + permitted, and indicates that the cause is nonexistent or + unknown.)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is a tool for migrating data from an older to a newer version + of an S3 filesystem. +

+

+ All files in the filesystem are migrated by re-writing the block metadata + - no datafiles are touched. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + Extracts AWS credentials from the filesystem URI or configuration. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A block-based {@link FileSystem} backed by + Amazon S3. +

+ @see NativeS3FileSystem]]> +
+
+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If f is a file, this method will make a single call to S3. + If f is a directory, this method will make a maximum of + (n / 1000) + 2 calls to S3, where n is the total number of + files and directories contained directly in f. +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + A {@link FileSystem} for reading and writing files stored on + Amazon S3. + Unlike {@link org.apache.hadoop.fs.s3.S3FileSystem} this implementation + stores files on S3 in their + native form so they can be read by other S3 tools. +

+ @see org.apache.hadoop.fs.s3.S3FileSystem]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + @param name The name of the server + @param port The port to use on the server + @param findPort whether the server should start at the given port and + increment by 1 until it finds a free port. + @param conf Configuration]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + points to the log directory + "/static/" -> points to common static files (src/webapps/static) + "/" -> the jsp server code from (src/webapps/)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nth value.]]> + + + + + + + + + + + + + + + + + + + + + nth value in the file.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + public class IntArrayWritable extends ArrayWritable { + public IntArrayWritable() { + super(IntWritable.class); + } + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a ByteWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataInputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using DataInput methods ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new DataOutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using DataOutput methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + the class of the item + @param conf the configuration to store + @param item the object to be stored + @param keyName the name of the key to use + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param items the objects to be stored + @param keyName the name of the key to use + @throws IndexOutOfBoundsException if the items array is empty + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + + + + + the class of the item + @param conf the configuration to use + @param keyName the name of the key to use + @param itemClass the class of the item + @return restored object + @throws IOException : forwards Exceptions from the underlying + {@link Serialization} classes.]]> + + + + + DefaultStringifier offers convenience methods to store/load objects to/from + the configuration. + + @param the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a DoubleWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a FloatWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When two sequence files, which have same Key type but different Value + types, are mapped out to reduce, multiple Value types is not allowed. + In this case, this class can help you wrap instances with different types. +

+ +

+ Compared with ObjectWritable, this class is much more efficient, + because ObjectWritable will append the class declaration as a String + into the output file in every Key-Value pair. +

+ +

+ Generic Writable implements {@link Configurable} interface, so that it will be + configured by the framework. The configuration is passed to the wrapped objects + implementing {@link Configurable} interface before deserialization. +

+ + how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implement the abstract method getTypes(), which defines + the classes that will be wrapped in GenericObject in the application. + Attention: the classes defined in the getTypes() method must + implement the Writable interface. +

+ + The code looks like this: +
+ public class GenericObject extends GenericWritable {
+ 
+   private static Class[] CLASSES = {
+               ClassType1.class, 
+               ClassType2.class,
+               ClassType3.class,
+               };
+
+   protected Class[] getTypes() {
+       return CLASSES;
+   }
+
+ }
+ 
+ + @since Nov 8, 2006]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new InputStream and + ByteArrayInputStream each time data is read. + +

Typical usage is something like the following:

+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+   byte[] data = ... get data ...;
+   int dataLength = ... get data length ...;
+   buffer.reset(data, dataLength);
+   ... read buffer using InputStream methods ...
+ }
+ 
+ @see DataInputBuffer + @see DataOutput]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a IntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + closes the input and output streams + at the end. + @param in InputStrem to read from + @param out OutputStream to write to + @param conf the Configuration object]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ignore any {@link IOException} or + null pointers. Must only be used for cleanup in exception handlers. + @param log the log to record problems to at debug level. Can be null. + @param closeables the objects to close]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a LongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A map is a directory containing two files, the data file, + containing all keys and values in the map, and a smaller index + file, containing a fraction of the keys. The fraction is determined by + {@link Writer#getIndexInterval()}. + +

The index file is read entirely into memory. Thus key implementations + should try to keep themselves small. + +

Map files are created by adding entries in-order. To maintain a large + database, perform updates by copying the previous version of a database and + merging in a sorted change list, to create a new version of the database in + a new file. Sorting large change lists can be done with {@link + SequenceFile.Sorter}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key and + val. Returns true if such a pair exists and false when at + the end of the map]]> + + + + + + + + + + + + + + + + key or if it does not exist, at the first entry + after the named key. + +- * @param key - key that we're trying to find +- * @param val - data value if key is found +- * @return - the key that was the closest match or null if eof.]]> + + + + + + + + + key does not exist, return + the first entry that falls just before the key. Otherwise, + return the record that sorts just after. + @return - the key that was the closest match or null if eof.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is an MD5Hash whose digest contains the + same values.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This saves memory over creating a new OutputStream and + ByteArrayOutputStream each time data is written. + +

Typical usage is something like the following:

+
+ OutputBuffer buffer = new OutputBuffer();
+ while (... loop condition ...) {
+   buffer.reset();
+   ... write buffer using OutputStream methods ...
+   byte[] data = buffer.getData();
+   int dataLength = buffer.getLength();
+   ... write data to its ultimate destination ...
+ }
+ 
+ @see DataOutputBuffer + @see InputBuffer]]> +
+
+ + + + + + + + + + + + + + + A {@link Comparator} that operates directly on byte representations of + objects. +

+ @param + @see DeserializerComparator]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SequenceFiles are flat files consisting of binary key/value + pairs. + +

SequenceFile provides {@link Writer}, {@link Reader} and + {@link Sorter} classes for writing, reading and sorting respectively.

+ + There are three SequenceFile Writers based on the + {@link CompressionType} used to compress key/value pairs: +
    +
  1. + Writer : Uncompressed records. +
  2. + RecordCompressWriter : Record-compressed files, only compress + values. +
  3. + BlockCompressWriter : Block-compressed files, both keys & + values are collected in 'blocks' + separately and compressed. The size of + the 'block' is configurable. +
+ +

The actual compression algorithm used to compress key and/or values can be + specified by using the appropriate {@link CompressionCodec}.

+ +

The recommended way is to use the static createWriter methods + provided by the SequenceFile to choose the preferred format.

+ +

The {@link Reader} acts as the bridge and can read any of the above + SequenceFile formats.

+ +

SequenceFile Formats

+ +

Essentially there are 3 different formats for SequenceFiles + depending on the CompressionType specified. All of them share a + common header described below. + +

+
    +
  • + version - 3 bytes of magic header SEQ, followed by 1 byte of actual + version number (e.g. SEQ4 or SEQ6) +
  • + keyClassName - key class +
  • + valueClassName - value class +
  • + compression - A boolean which specifies if compression is turned on for + keys/values in this file. +
  • + blockCompression - A boolean which specifies if block-compression is + turned on for keys/values in this file. +
  • + compression codec - CompressionCodec class which is used for + compression of keys and/or values (if compression is + enabled). +
  • + metadata - {@link Metadata} for this file. +
  • + sync - A sync marker to denote end of the header. +
+ +
Uncompressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record +
      +
    • Record length
    • +
    • Key length
    • +
    • Key
    • +
    • Value
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +
Record-Compressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record +
      +
    • Record length
    • +
    • Key length
    • +
    • Key
    • +
    • Compressed Value
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +
Block-Compressed SequenceFile Format
+
    +
  • + Header +
  • +
  • + Record Block +
      +
    • Compressed key-lengths block-size
    • +
    • Compressed key-lengths block
    • +
    • Compressed keys block-size
    • +
    • Compressed keys block
    • +
    • Compressed value-lengths block-size
    • +
    • Compressed value-lengths block
    • +
    • Compressed values block-size
    • +
    • Compressed values block
    • +
    +
  • +
  • + A sync-marker every few 100 bytes or so. +
  • +
+ +

The compressed blocks of key lengths and value lengths consist of the + actual lengths of individual keys/values encoded in ZeroCompressedInteger + format.

+ + @see CompressionCodec]]> +
+
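+ 
+ A hedged sketch of writing and then reading an uncompressed SequenceFile with the
+ createWriter/Reader classes described above; the path and the key/value choices are
+ illustrative only:
+ 
+     import org.apache.hadoop.conf.Configuration;
+     import org.apache.hadoop.fs.FileSystem;
+     import org.apache.hadoop.fs.Path;
+     import org.apache.hadoop.io.IntWritable;
+     import org.apache.hadoop.io.SequenceFile;
+     import org.apache.hadoop.io.Text;
+     
+     public class SequenceFileDemo {
+       public static void main(String[] args) throws Exception {
+         Configuration conf = new Configuration();
+         FileSystem fs = FileSystem.get(conf);
+         Path file = new Path("/tmp/demo.seq");
+     
+         // Write ten key/value records with the uncompressed Writer.
+         SequenceFile.Writer writer =
+             SequenceFile.createWriter(fs, conf, file, IntWritable.class, Text.class);
+         for (int i = 0; i < 10; i++) {
+           writer.append(new IntWritable(i), new Text("value-" + i));
+         }
+         writer.close();
+     
+         // Read them back; next() returns false at end of file.
+         SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+         IntWritable key = new IntWritable();
+         Text value = new Text();
+         while (reader.next(key, value)) {
+           System.out.println(key + "\t" + value);
+         }
+         reader.close();
+       }
+     }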
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key, skipping its + value. True if another entry exists, and false at end of file.]]> + + + + + + + + key and + val. Returns true if such a pair exists and false when at + end of file]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The position passed must be a position returned by {@link + SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary + position, use {@link SequenceFile.Reader#sync(long)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + SegmentDescriptor + @param segments the list of SegmentDescriptors + @param tmpDir the directory to write temporary files into + @return RawKeyValueIterator + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For best performance, applications should make sure that the {@link + Writable#readFields(DataInput)} implementation of their keys is + very efficient. In particular, it should avoid allocating memory.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This always returns a synchronized position. In other words, + immediately after calling {@link SequenceFile.Reader#seek(long)} with a position + returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However + the key may be earlier in the file than key last written when this + method was called (e.g., with block-compression, it may be the first key + in the block that was being written when this method was called).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + key. Returns + true if such a key exists and false when at the end of the set.]]> + + + + + + + key. + Returns key, or null if no match exists.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the objects to stringify]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + position. Note that this + method avoids using the converter or doing String instatiation + @return the Unicode scalar value at position or -1 + if the position is invalid or points to a + trailing byte]]> + + + + + + + + + + what in the backing + buffer, starting as position start. The starting + position is measured in bytes and the return value is in + terms of byte position in the buffer. The backing buffer is + not converted to a string for this operation. 
+ @return byte position of the first occurence of the search + string in the UTF-8 buffer or -1 if not found]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a Text with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException.]]> + + + + + + + + + + + + + + + replace is true, then + malformed input is replaced with the + substitution character, which is U+FFFD. Otherwise the + method throws a MalformedInputException. + @return ByteBuffer: bytes stores at ByteBuffer.array() + and length is ByteBuffer.limit()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In + addition, it provides methods for string traversal without converting the + byte array to a string.

Also includes utilities for + serializing/deserialing a string, coding/decoding a string, checking if a + byte array contains valid UTF8 code, calculating the length of an encoded + string.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a UTF8 with the same contents.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + Also includes utilities for efficiently reading and writing UTF-8. + + @deprecated replaced by Text]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This is useful when a class may evolve, so that instances written by the + old version of the class may still be processed by the new version. To + handle this situation, {@link #readFields(DataInput)} + implementations should catch {@link VersionMismatchException}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VIntWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + o is a VLongWritable with the same value.]]> + + + + + + + + + + + + + + + + + + + + + + + + out. + + @param out DataOuput to serialize this object into. + @throws IOException]]> + + + + + + + in. + +

For efficiency, implementations should attempt to re-use storage in the + existing object where possible.

+ + @param in DataInput to deserialize this object from. + @throws IOException]]> +
+ + + Any key or value type in the Hadoop Map-Reduce + framework implements this interface.

+ +

Implementations typically implement a static read(DataInput) + method which constructs a new instance, calls {@link #readFields(DataInput)} + and returns the instance.

+ +

Example:

+

+     public class MyWritable implements Writable {
+       // Some data     
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public static MyWritable read(DataInput in) throws IOException {
+         MyWritable w = new MyWritable();
+         w.readFields(in);
+         return w;
+       }
+     }
+ 

]]> +
+ + + + + + + + WritableComparables can be compared to each other, typically + via Comparators. Any type which is to be used as a + key in the Hadoop Map-Reduce framework should implement this + interface.

+ +

Example:

+

+     public class MyWritableComparable implements WritableComparable {
+       // Some data
+       private int counter;
+       private long timestamp;
+       
+       public void write(DataOutput out) throws IOException {
+         out.writeInt(counter);
+         out.writeLong(timestamp);
+       }
+       
+       public void readFields(DataInput in) throws IOException {
+         counter = in.readInt();
+         timestamp = in.readLong();
+       }
+       
+       public int compareTo(MyWritableComparable w) {
+         int thisValue = this.counter;
+         int thatValue = w.counter;
+         return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+       }
+     }
+ 

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The default implementation reads the data into two {@link + WritableComparable}s (using {@link + Writable#readFields(DataInput)}, then calls {@link + #compare(WritableComparable,WritableComparable)}.]]> + + + + + + + The default implementation uses the natural ordering, calling {@link + Comparable#compareTo(Object)}.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This base implemenation uses the natural ordering. To define alternate + orderings, override {@link #compare(WritableComparable,WritableComparable)}. + +

One may optimize compare-intensive operations by overriding + {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are + provided to assist in optimized implementations of this method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Enum type + @param in DataInput to read from + @param enumType Class type of Enum + @return Enum represented by String read from DataInput + @throws IOException]]> + + + + + + + + + + + + + + + + len number of bytes in input streamin + @param in input stream + @param len number of bytes to skip + @throws IOException when skipped less number of bytes]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CompressionCodec for which to get the + Compressor + @return Compressor for the given + CompressionCodec from the pool or a new one]]> + + + + + + CompressionCodec for which to get the + Decompressor + @return Decompressor for the given + CompressionCodec the pool or a new one]]> + + + + + + Compressor to be returned to the pool]]> + + + + + + Decompressor to be returned to the + pool]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Implementations are assumed to be buffered. This permits clients to + reposition the underlying input stream then call {@link #resetState()}, + without having to also synchronize client buffers.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true indicating that more input data is required. + + @param b Input data + @param off Start offset + @param len Length]]> + + + + + true if the input data buffer is empty and + #setInput() should be called in order to provide more input.]]> + + + + + + + + + + + + + true if a preset dictionary is needed for decompression. 
+ @return true if a preset dictionary is needed for decompression]]> + + + + + true if the end of the compressed + data output stream has been reached.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FIXME: This array should be in a private or package private location, + since it could be modified by malicious code. +

]]> +
+ + + + This interface is public for historical purposes. You should have no need to + use it. +

]]> +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Although BZip2 headers are marked with the magic "Bz" this + constructor expects the next byte in the stream to be the first one after + the magic. Thus callers have to skip the first two bytes. Otherwise this + constructor will throw an exception. +

+ + @throws IOException + if the stream content is malformed or an I/O error occurs. + @throws NullPointerException + if in == null]]> +
+
+ + + + + + + + + + + + + + + The decompression requires large amounts of memory. Thus you should call the + {@link #close() close()} method as soon as possible, to force + CBZip2InputStream to release the allocated memory. See + {@link CBZip2OutputStream CBZip2OutputStream} for information about memory + usage. +

+ +

+ CBZip2InputStream reads bytes from the compressed source stream via + the single byte {@link java.io.InputStream#read() read()} method exclusively. + Thus you should consider to use a buffered source stream. +

+ +

+ Instances of this class are not threadsafe. +

]]> +
+
+ + + + + + + + CBZip2OutputStream with a blocksize of 900k. + +

+ Attention: The caller is responsible for writing the two BZip2 magic + bytes "BZ" to the specified stream prior to calling this + constructor. +

+ + @param out + the destination stream. + + @throws IOException + if an I/O error occurs in the specified stream. + @throws NullPointerException + if out == null.]]> +
+
+ + + + CBZip2OutputStream with specified blocksize. + +

+ Attention: The caller is responsible for writing the two BZip2 magic + bytes "BZ" to the specified stream prior to calling this + constructor. +

+ + + @param out + the destination stream. + @param blockSize + the blockSize as 100k units. + + @throws IOException + if an I/O error occurs in the specified stream. + @throws IllegalArgumentException + if (blockSize < 1) || (blockSize > 9). + @throws NullPointerException + if out == null. + + @see #MIN_BLOCKSIZE + @see #MAX_BLOCKSIZE]]> +
+
+ + + + + + + + + + + + + inputLength this method returns MAX_BLOCKSIZE + always. + + @param inputLength + The length of the data which will be compressed by + CBZip2OutputStream.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + == 1.]]> + + + + + == 9.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If you are ever unlucky/improbable enough to get a stack overflow whilst + sorting, increase the following constant and try again. In practice I + have never seen the stack go above 27 elems, so the following limit seems + very generous. +
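+ 
+ A hedged sketch of the calling convention required above: the two magic bytes "BZ"
+ are written by the caller before the CBZip2OutputStream is constructed (the output
+ file name is illustrative):
+ 
+     import java.io.FileOutputStream;
+     import java.io.OutputStream;
+     import org.apache.hadoop.io.compress.bzip2.CBZip2OutputStream;
+     
+     public class BZip2WriteDemo {
+       public static void main(String[] args) throws Exception {
+         OutputStream raw = new FileOutputStream("/tmp/demo.bz2");
+         raw.write('B');   // magic bytes written by the caller, per the javadoc above
+         raw.write('Z');
+         CBZip2OutputStream out = new CBZip2OutputStream(raw);  // default 900k blocksize
+         out.write("hello, bzip2".getBytes("UTF-8"));
+         out.close();      // finishes the stream and releases the large internal buffers
+       }
+     }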

]]> +
+
+ + + The compression requires large amounts of memory. Thus you should call the + {@link #close() close()} method as soon as possible, to force + CBZip2OutputStream to release the allocated memory. +

+ +

+ You can shrink the amount of allocated memory and maybe raise the compression + speed by choosing a lower blocksize, which in turn may cause a lower + compression ratio. You can avoid unnecessary memory allocation by avoiding + using a blocksize which is bigger than the size of the input. +

+ +

+ You can compute the memory usage for compressing by the following formula: +

+ +
+ <code>400k + (9 * blocksize)</code>.
+ 
+ +

+ To get the memory required for decompression by {@link CBZip2InputStream + CBZip2InputStream} use +

+ +
+ <code>65k + (5 * blocksize)</code>.
+ 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Memory usage by blocksize:
 Blocksize    Compression memory usage    Decompression memory usage
 100k         1300k                       565k
 200k         2200k                       1065k
 300k         3100k                       1565k
 400k         4000k                       2065k
 500k         4900k                       2565k
 600k         5800k                       3065k
 700k         6700k                       3565k
 800k         7600k                       4065k
 900k         8500k                       4565k
+ +

+ For decompression CBZip2InputStream allocates less memory if the + bzipped input is smaller than one block. +

+ +

+ Instances of this class are not threadsafe. +

+ +

+ TODO: Update to BZip2 1.0.1 +

]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @return the total (non-negative) number of uncompressed bytes input so far]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-zlib is loaded & initialized + and can be loaded for this job, else false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keep trying a limited number of times, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying for a maximum time, waiting a fixed time between attempts, + and then fail by re-throwing the exception. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime mutliplied by the number of tries so far. +

]]> +
+
+ + + + + + + Keep trying a limited number of times, waiting a growing amount of time between attempts, + and then fail by re-throwing the exception. + The time between attempts is sleepTime mutliplied by a random + number in the range of [0, 2 to the number of retries) +

]]> +
+
+ + + + + + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + + + A retry policy for RemoteException + Set a default policy with some explicit handlers for specific exceptions. +

]]> +
+
+ + + + Try once, and fail by re-throwing the exception. + This corresponds to having no retry mechanism in place. +

]]> +
+
+ + + + Try once, and fail silently for void methods, or by + re-throwing the exception for non-void methods. +

]]> +
+
+ + + + Keep trying forever. +

]]> +
+
+ + + A collection of useful implementations of {@link RetryPolicy}. +

]]> +
+
+ + + + + + + + + + Determines whether the framework should retry a + method for the given exception, and the number + of retries that have been made for that operation + so far. +

+ @param e The exception that caused the method to fail. + @param retries The number of times the method has been retried. + @return true if the method should be retried, + false if the method should not be retried + but shouldn't fail with an exception (only for void methods). + @throws Exception The re-thrown exception e indicating + that the method failed and should not be retried further.]]> +
+
+ + + Specifies a policy for retrying method failures. + Implementations of this interface should be immutable. +

]]> +
+
+ + + + + + + + + + + + Create a proxy for an interface of an implementation class + using the same retry policy for each method in the interface. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param retryPolicy the policy for retrying method call failures + @return the retry proxy]]> +
+
+ + + + + + + Create a proxy for an interface of an implementation class + using the a set of retry policies specified by method name. + If no retry policy is defined for a method then a default of + {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used. +

+ @param iface the interface that the retry will implement + @param implementation the instance whose methods should be retried + @param methodNameToPolicyMap a map of method names to retry policies + @return the retry proxy]]> +
+
+ + + A factory for creating retry proxies. +

]]> +
+
+ +
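+ 
+ A hedged sketch of wiring one of the policies above to a proxy; the DataStore
+ interface is hypothetical, only RetryPolicies/RetryPolicy/RetryProxy come from this
+ package, and the retry count and sleep time are illustrative:
+ 
+     import java.io.IOException;
+     import java.util.concurrent.TimeUnit;
+     import org.apache.hadoop.io.retry.RetryPolicies;
+     import org.apache.hadoop.io.retry.RetryPolicy;
+     import org.apache.hadoop.io.retry.RetryProxy;
+     
+     public class RetryDemo {
+       // Hypothetical interface whose methods may fail transiently.
+       public interface DataStore {
+         String fetch(String key) throws IOException;
+       }
+     
+       public static DataStore withRetries(DataStore impl) {
+         // Retry each failed call up to 5 times, sleeping 2 seconds between attempts.
+         RetryPolicy policy =
+             RetryPolicies.retryUpToMaximumCountWithFixedSleep(5, 2, TimeUnit.SECONDS);
+         return (DataStore) RetryProxy.create(DataStore.class, impl, policy);
+       }
+     }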
+ + + + + + + + Prepare the deserializer for reading.

]]> +
+
+ + + + + + Deserialize the next object from the underlying input stream. + If the object t is non-null then this deserializer + may set its internal state to the next object read from the input + stream. Otherwise, if the object t is null a new + deserialized object will be created. +

+ @return the deserialized object]]> +
+
+ + + + Close the underlying input stream and clear up any resources.

]]> +
+
+ + + Provides a facility for deserializing objects of type from an + {@link InputStream}. +

+ +

+ Deserializers are stateful, but must not buffer the input since + other producers may read from the input between calls to + {@link #deserialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + A {@link RawComparator} that uses a {@link Deserializer} to deserialize + the objects to be compared so that the standard {@link Comparator} can + be used to compare them. +

+

+ One may optimize compare-intensive operations by using a custom + implementation of {@link RawComparator} that operates directly + on byte representations. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + An experimental {@link Serialization} for Java {@link Serializable} classes. +

+ @see JavaSerializationComparator]]> +
+
+ + + + + + + + + + + + + A {@link RawComparator} that uses a {@link JavaSerialization} + {@link Deserializer} to deserialize objects that are then compared via + their {@link Comparable} interfaces. +

+ @param + @see JavaSerialization]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + Encapsulates a {@link Serializer}/{@link Deserializer} pair. +

+ @param ]]> +
+
+ + + + + + + Serializations are found by reading the io.serializations + property from conf, which is a comma-delimited list of + classnames. +

]]> +
+
+ + + + + + + + + + + + A factory for {@link Serialization}s. +

]]> +
+
+ + + + + + + + Prepare the serializer for writing.

]]> +
+
+ + + + + Serialize t to the underlying output stream.

]]> +
+
+ + + + Close the underlying output stream and clear up any resources.

]]> +
+
+ + + Provides a facility for serializing objects of type to an + {@link OutputStream}. +

+ +

+ Serializers are stateful, but must not buffer the output since + other producers may write to the output between calls to + {@link #serialize(Object)}. +

+ @param ]]> +
+
+ + + + + + + + + + + + + + + + + + + + +
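+ 
+ A hedged sketch of the open/serialize/close life-cycle described above, using
+ SerializationFactory to look up a Serializer for a class (the output path is
+ illustrative):
+ 
+     import java.io.FileOutputStream;
+     import org.apache.hadoop.conf.Configuration;
+     import org.apache.hadoop.io.Text;
+     import org.apache.hadoop.io.serializer.SerializationFactory;
+     import org.apache.hadoop.io.serializer.Serializer;
+     
+     public class SerializerDemo {
+       public static void main(String[] args) throws Exception {
+         Configuration conf = new Configuration();
+         SerializationFactory factory = new SerializationFactory(conf);
+         Serializer<Text> serializer = factory.getSerializer(Text.class);
+     
+         FileOutputStream out = new FileOutputStream("/tmp/text.bin");
+         serializer.open(out);                      // prepare for writing
+         serializer.serialize(new Text("hello"));   // may be called repeatedly
+         serializer.close();                        // closes the underlying stream
+       }
+     }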
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + param, to the IPC server running at + address, returning the value. Throws exceptions if there are + network problems or if the remote code threw an exception. + @deprecated Use {@link #call(Writable, InetSocketAddress, Class, UserGroupInformation)} instead]]> + + + + + + + + + + param, to the IPC server running at + address with the ticket credentials, returning + the value. + Throws exceptions if there are network problems or if the remote code + threw an exception. + @deprecated Use {@link #call(Writable, InetSocketAddress, Class, UserGroupInformation)} instead]]> + + + + + + + + + + + param, to the IPC server running at + address which is servicing the protocol protocol, + with the ticket credentials, returning the value. + Throws exceptions if there are network problems or if the remote code + threw an exception.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unwraps any IOException. + + @param lookupTypes the desired exception class. + @return IOException, which is either the lookupClass exception or this.]]> + + + + + This unwraps any Throwable that has a constructor taking + a String as a parameter. + Otherwise it returns this. + + @return Throwable]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + protocol is a Java interface. All parameters and return types must + be one of: + +
  • a primitive type, boolean, byte, + char, short, int, long, + float, double, or void; or
  • a {@link String}; or
  • a {@link Writable}; or
  • an array of the above types
+ + All methods in the protocol should throw only IOException. No field data of + the protocol instance is transmitted.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + handlerCount determines + the number of handler threads that will be used to process calls.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
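+ 
+ A hedged sketch of a protocol interface that obeys the rules listed above:
+ parameters and return types restricted to primitives, String, Writable, or arrays
+ of those, with every method declared to throw only IOException. The interface and
+ its methods are hypothetical, not part of Hadoop:
+ 
+     import java.io.IOException;
+     import org.apache.hadoop.io.Text;
+     
+     public interface PingProtocol {
+       // a Writable parameter and return type
+       Text ping(Text message) throws IOException;
+     
+       // primitive parameters and return type
+       long add(int a, int b) throws IOException;
+     
+       // an array of one of the allowed types
+       String[] listNames() throws IOException;
+     }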
+ + + + + + + + + + + + ,name=RpcActivityForPort" + + Many of the activity metrics are sampled and averaged on an interval + which can be specified in the metrics config file. +

+ For the metrics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most metrics contexts do. + The default Null metrics context, however, does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-metrics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically + + + + Impl details: We use a dynamic mbean that gets the list of the metrics + from the metrics registry passed as an argument to the constructor]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class has a number of metrics variables that are publicly accessible; + these variables (objects) have methods to update their values; + for example: +

{@link #rpcQueueTime}.inc(time)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For the statistics that are sampled and averaged, one must specify + a metrics context that does periodic update calls. Most do. + The default Null metrics context however does NOT. So if you aren't + using any other metrics context then you can turn on the viewing and averaging + of sampled metrics by specifying the following two lines + in the hadoop-meterics.properties file: +

+        rpc.class=org.apache.hadoop.metrics.spi.NullContextWithUpdateThread
+        rpc.period=10
+  
+

+ Note that the metrics are collected regardless of the context used. + The context with the update thread is used to average the data periodically]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + When constructing the instance, if the factory property + contextName.class exists, + its value is taken to be the name of the class to instantiate. Otherwise, + the default is to create an instance of + org.apache.hadoop.metrics.spi.NullContext, which is a + dummy "no-op" context which will cause all metric data to be discarded. + + @param contextName the name of the context + @return the named MetricsContext]]> + + + + + + + + + + + + + + + + + + + + + When the instance is constructed, this method checks if the file + hadoop-metrics.properties exists on the class path. If it + exists, it must be in the format defined by java.util.Properties, and all + the properties in the file are set as attributes on the newly created + ContextFactory instance. + + @return the singleton ContextFactory instance]]> + + + + getFactory() method.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + startMonitoring() again after calling + this. + @see #close()]]> + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A record name identifies the kind of data to be reported. For example, a + program reporting statistics relating to the disks on a computer might use + a record name "diskStats".

+ + A record has zero or more tags. A tag has a name and a value. To + continue the example, the "diskStats" record might use a tag named + "diskName" to identify a particular disk. Sometimes it is useful to have + more than one tag, so there might also be a "diskType" with value "ide" or + "scsi" or whatever.

+ + A record also has zero or more metrics. These are the named + values that are to be reported to the metrics system. In the "diskStats" + example, possible metric names would be "diskPercentFull", "diskPercentBusy", + "kbReadPerSecond", etc.

+ + The general procedure for using a MetricsRecord is to fill in its tag and + metric values, and then call update() to pass the record to the + client library. + Metric data is not immediately sent to the metrics system + each time that update() is called. + An internal table is maintained, identified by the record name. This + table has columns + corresponding to the tag and the metric names, and rows + corresponding to each unique set of tag values. An update + either modifies an existing row in the table, or adds a new row with a set of + tag values that are different from all the other rows. Note that if there + are no tags, then there can be at most one row in the table.

+ + Once a row is added to the table, its data will be sent to the metrics system + on every timer period, whether or not it has been updated since the previous + timer period. If this is inappropriate, for example if metrics were being + reported by some transient object in an application, the remove() + method can be used to remove the row and thus stop the data from being + sent.

+ + Note that the update() method is atomic. This means that it is + safe for different threads to be updating the same metric. More precisely, + it is OK for different threads to call update() on MetricsRecord instances + with the same set of tag names and tag values. Different threads should + not use the same MetricsRecord instance at the same time.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MetricsContext.registerUpdater().]]> + + + + + + + + + + + + + + + + + + + + + + + + + fileName attribute, + if specified. Otherwise the data will be written to standard + output.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class is configured by setting ContextFactory attributes which in turn + are usually configured through a properties file. All the attributes are + prefixed by the contextName. For example, the properties file might contain: +

+ myContextName.fileName=/tmp/metrics.log
+ myContextName.period=5
+ 
]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + contextName.tableName. The returned map consists of + those attributes with the contextName and tableName stripped off.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + recordName. + Throws an exception if the metrics implementation is configured with a fixed + set of record names and recordName is not in that set. + + @param recordName the name of the record + @throws MetricsException if recordName conflicts with configuration data]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This class implements the internal table of metric data, and the timer + on which data is to be sent to the metrics system. Subclasses must + override the abstract emitRecord method in order to transmit + the data.

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + update + and remove().]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hostname or hostname:port. If + the specs string is null, defaults to localhost:defaultPort. + + @return a list of InetSocketAddress objects.]]> + + + + + + + + + + + + + + + + + + + ,name=" + Where the and are the supplied parameters + + @param serviceName + @param nameName + @param theMbean - the MBean to register + @return the named used to register the MBean]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.rpc.socket.factory.class.<ClassName>. When no + such parameter exists then fall back on the default socket factory as + configured by hadoop.rpc.socket.factory.class.default. If + this default socket factory is not configured, then fall back on the JVM + default socket factory. + + @param conf the configuration + @param clazz the class (usually a {@link VersionedProtocol}) + @return a socket factory]]> + + + + + + hadoop.rpc.socket.factory.default + + @param conf the configuration + @return the default socket factory as specified in the configuration or + the JVM default socket factory if the configuration does not + contain a default socket factory property.]]> + + + + + + + + + + + + + : + ://:/]]> + + + + + + + + : + ://:/]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketInputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getInputStream()} is returned. In the latter + case, the timeout argument is ignored and the timeout set with + {@link Socket#setSoTimeout(int)} applies for reads.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see #getInputStream(Socket, long) + + @param socket + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getInputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return InputStream for reading from the socket. + @throws IOException]]> +
+
+ + + + +
+ + From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated + SocketChannel then it returns a + {@link SocketOutputStream} with the given timeout. If the socket does not + have a channel, {@link Socket#getOutputStream()} is returned. In the latter + case, the timeout argument is ignored and the write will wait until + data is available.

+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see #getOutputStream(Socket, long) + + @param socket + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+ + + + + +
+ + Any socket created using socket factories returned by {@link #NetUtils}, + must use this interface instead of {@link Socket#getOutputStream()}. + + @see Socket#getChannel() + + @param socket + @param timeout timeout in milliseconds. This may not always apply. zero + for waiting as long as necessary. + @return OutputStream for writing to the socket. + @throws IOException]]> +
+
+ + + + + + + socket.connect(endpoint, timeout). If + socket.getChannel() returns a non-null channel, + connect is implemented using Hadoop's selectors. This is done mainly + to avoid Sun's connect implementation from creating thread-local + selectors, since Hadoop does not have control on when these are closed + and could end up taking all the available file descriptors. + + @see java.net.Socket#connect(java.net.SocketAddress, int) + + @param socket + @param endpoint + @param timeout - timeout in milliseconds]]> + + + + + + + + + + + + + + +
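+ Put together, a hedged sketch of client-side socket setup with these helpers (the host, port and 10 second timeouts are placeholders, and conf is an existing Configuration):
+
+     Socket socket = NetUtils.getDefaultSocketFactory(conf).createSocket();
+     NetUtils.connect(socket, new InetSocketAddress("namenode.example.com", 8020), 10000);  // connect timeout
+     InputStream in   = NetUtils.getInputStream(socket, 10000);   // read timeout, when a channel exists
+     OutputStream out = NetUtils.getOutputStream(socket, 10000);  // write timeout, when a channel exists
+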
+ + + + + + + + + + + + + + + + + + + + + node + + @param node + a node + @return true if node is already in the tree; false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + scope + if scope starts with ~, choose one from the all nodes except for the + ones in scope; otherwise, choose one from scope + @param scope range of nodes from which a node will be choosen + @return the choosen node]]> + + + + + + + scope but not in excludedNodes + if scope starts with ~, return the number of nodes that are not + in scope and excludedNodes; + @param scope a path string that may start with ~ + @param excludedNodes a list of nodes + @return number of available nodes]]> + + + + + + + + + + + + reader + It linearly scans the array, if a local node is found, swap it with + the first element of the array. + If a local rack node is found, swap it with the first element following + the local node. + If neither local node or local rack node is found, put a random replica + location at postion 0. + It leaves the rest nodes untouched.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout in milliseconds. Must not be negative. + @throws IOException]]> +
+
+ + + +
+ + Create a new input stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + @see SocketInputStream#SocketInputStream(ReadableByteChannel, long) + + @param socket should have a channel associated with it. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + +
+ + Create a new output stream with the given timeout. If the timeout + is zero, it will be treated as infinite timeout. The socket's + channel will be configured to be non-blocking. + + @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long) + + @param socket should have a channel associated with it. + @param timeout timeout in milliseconds. Must not be negative. + @throws IOException]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = getCount(). + @param newCapacity The new capacity in bytes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index idx = startVector(...); + while (!idx.done()) { + .... // read element of a vector + idx.incr(); + } + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This task takes the given record definition files and compiles them into + java or c++ + files. It is then up to the user to compile the generated files. + +

The task requires the file or the nested fileset element to be + specified. Optional attributes are language (set the output + language, default is "java"), + destdir (name of the destination directory for generated java/c++ + code, default is ".") and failonerror (specifies error handling + behavior. default is true). +

Usage

+
+ <recordcc
+       destdir="${basedir}/gensrc"
+       language="java">
+   <fileset include="**\/*.jr" />
+ </recordcc>
+ 
]]> +
+
+ +
+ + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cause==null ? null : cause.toString()) (which + typically contains the class and detail message of cause). + @param cause the cause (which is saved for later retrieval by the + {@link #getCause()} method). (A null value is + permitted, and indicates that the cause is nonexistent or + unknown.)]]> + + + + + + + + + + + + + Group with the given groupname. + @param group group name]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi. + @param ugi user + @return the {@link Subject} for the user identified by ugi]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ugi as a comma separated string in + conf as a property attr + + The String starts with the user name followed by the default group names, + and other group names. + + @param conf configuration + @param attr property name + @param ugi a UnixUserGroupInformation]]> + + + + + + + + conf + + The object is expected to store with the property name attr + as a comma separated string that starts + with the user name followed by group names. + If the property name is not defined, return null. + It's assumed that there is only one UGI per user. If this user already + has a UGI in the ugi map, return the ugi in the map. + Otherwise, construct a UGI from the configuration, store it in the + ugi map and return it. + + @param conf configuration + @param attr property name + @return a UnixUGI + @throws LoginException if the stored string is ill-formatted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + User with the given username. + @param user user name]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cause==null ? null : cause.toString()) (which + typically contains the class and detail message of cause). 
+ @param cause the cause (which is saved for later retrieval by the + {@link #getCause()} method). (A null value is + permitted, and indicates that the cause is nonexistent or + unknown.)]]> + + + + + + + + + + + + + + does not provide the stack trace for security purposes.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + service as related to + Service Level Authorization for Hadoop. + + Each service defines it's configuration key and also the necessary + {@link Permission} required to access the service.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + in]]> + + + + + + + out.]]> + + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + reset is true, then resets the checksum. + @return number of bytes written. Will be equal to getChecksumSize();]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GenericOptionsParser to parse only the generic Hadoop + arguments. + + The array of string arguments other than the generic arguments can be + obtained by {@link #getRemainingArgs()}. + + @param conf the Configuration to modify. + @param args command-line arguments.]]> + + + + + GenericOptionsParser to parse given options as well + as generic Hadoop options. + + The resulting CommandLine object can be obtained by + {@link #getCommandLine()}. + + @param conf the configuration to modify + @param options options built by the caller + @param args User-specified arguments]]> + + + + + Strings containing the un-parsed arguments + or empty array if commandLine was not defined.]]> + + + + + + + + + + CommandLine object + to process the parsed arguments. + + Note: If the object is created with + {@link #GenericOptionsParser(Configuration, String[])}, then returned + object will only contain parsed generic options. + + @return CommandLine representing list of arguments + parsed against Options descriptor.]]> + + + + + + + + + + + + + + + + + GenericOptionsParser is a utility to parse command line + arguments generic to the Hadoop framework. + + GenericOptionsParser recognizes several standarad command + line arguments, enabling applications to easily specify a namenode, a + jobtracker, additional configuration resources etc. + +

Generic Options

+ +

The supported generic options are:

+

+     -conf <configuration file>     specify a configuration file
+     -D <property=value>            use value for given property
+     -fs <local|namenode:port>      specify a namenode
+     -jt <local|jobtracker:port>    specify a job tracker
+     -files <comma separated list of files>    specify comma separated
+                            files to be copied to the map reduce cluster
+     -libjars <comma separated list of jars>   specify comma separated
+                            jar files to include in the classpath.
+     -archives <comma separated list of archives>    specify comma
+             separated archives to be unarchived on the compute machines.
+
+ 

+ +

The general command line syntax is:

+

+ bin/hadoop command [genericOptions] [commandOptions]
+ 

+ +

Generic command line arguments might modify + Configuration objects, given to constructors.

+ +

The functionality is implemented using Commons CLI.

+ +

Examples:

+

+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+ 
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+     
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+     
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+     
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+ 
+ $ bin/hadoop jar -libjars testlib.jar 
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+ 

+ + @see Tool + @see ToolRunner]]> +
+
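+ When used directly rather than through ToolRunner, the parser can be driven as in this sketch (args comes from main):
+
+     Configuration conf = new Configuration();
+     GenericOptionsParser parser = new GenericOptionsParser(conf, args);
+     // conf now reflects -conf/-D/-fs/-jt/-files/-libjars/-archives;
+     // whatever is left over belongs to the application.
+     String[] appArgs = parser.getRemainingArgs();
+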
+ + + + + + + + + Class<T>) of the + argument of type T. + @param The type of the argument + @param t the object to get it class + @return Class<T>]]> + + + + + + + List<T> to a an array of + T[]. + @param c the Class object of the items in the list + @param list the list to convert]]> + + + + + + List<T> to a an array of + T[]. + @param list the list to convert + @throws ArrayIndexOutOfBoundsException if the list is empty. + Use {@link #toArray(Class, List)} if the list may be empty.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + io.file.buffer.size specified in the given + Configuration. + @param in input stream + @param conf configuration + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if native-hadoop is loaded, + else false]]> + + + + + + true if native hadoop libraries, if present, can be + used for this job; false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { pq.top().change(); pq.adjustTop(); } + instead of
+  { o = pq.pop(); o.change(); pq.push(o); }
+ 
]]> +
+
+ + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Clients and/or applications can use the provided Progressable + to explicitly report progress to the Hadoop framework. This is especially + important for operations which take an insignificant amount of time since, + in-lieu of the reported progress, the framework has to assume that an error + has occured and time-out the operation.

]]> +
+
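+ A sketch of how a long-running operation keeps the framework informed through the Progressable it was handed (the copy loop is illustrative only):
+
+     void copyStream(InputStream in, OutputStream out, Progressable progress)
+         throws IOException {
+       byte[] buffer = new byte[64 * 1024];
+       int n;
+       while ((n = in.read(buffer)) > 0) {
+         out.write(buffer, 0, n);
+         progress.progress();   // report liveness so the operation is not timed out
+       }
+     }
+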
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Class is to be obtained + @return the correctly typed Class of the given object.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop Pipes + or Hadoop Streaming. + + It also checks to ensure that we are running on a *nix platform else + (e.g. in Cygwin/Windows) it returns null. + @param conf configuration + @return a String[] with the ulimit command arguments or + null if we are running on a non *nix platform or + if the limit is unspecified.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell interface. + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + Shell interface. + @param env the map of environment key=value + @param cmd shell command to execute. + @return the output of the executed command.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + Shell can be used to run unix commands like du or + df. It also offers facilities to gate commands by + time-intervals.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ShellCommandExecutorshould be used in cases where the output + of the command needs no explicit parsing and where the command, working + directory and the environment remains unchanged. The output of the command + is stored as-is and is expected to be small.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ArrayList of string values]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the char to be escaped + @return an escaped string]]> + + + + + + + + + + + + + + + + + + + + + + charToEscape in the string + with the escape char escapeChar + + @param str string + @param escapeChar escape char + @param charToEscape the escaped char + @return an unescaped string]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tool, is the standard for any Map-Reduce tool/application. + The tool/application should delegate the handling of + + standard command-line options to {@link ToolRunner#run(Tool, String[])} + and only handle its custom arguments.

+ +

Here is how a typical Tool is implemented:

+

+     public class MyApp extends Configured implements Tool {
+     
+       public int run(String[] args) throws Exception {
+         // Configuration processed by ToolRunner
+         Configuration conf = getConf();
+         
+         // Create a JobConf using the processed conf
+         JobConf job = new JobConf(conf, MyApp.class);
+         
+         // Process custom command-line options
+         Path in = new Path(args[1]);
+         Path out = new Path(args[2]);
+         
+         // Specify various job-specific parameters     
+         job.setJobName("my-app");
+         job.setInputPath(in);
+         job.setOutputPath(out);
+         job.setMapperClass(MyApp.MyMapper.class);
+         job.setReducerClass(MyApp.MyReducer.class);
+
+         // Submit the job, then poll for progress until the job is complete
+         JobClient.runJob(job);
+       }
+       
+       public static void main(String[] args) throws Exception {
+         // Let ToolRunner handle generic command-line options 
+         int res = ToolRunner.run(new Configuration(), new Sort(), args);
+         
+         System.exit(res);
+       }
+     }
+ 

+ + @see GenericOptionsParser + @see ToolRunner]]> +
+
+ + + + + + + + + + + + Tool by {@link Tool#run(String[])}, after + parsing with the given generic arguments. Uses the given + Configuration, or builds one if null. + + Sets the Tool's configuration with the possibly modified + version of the conf. + + @param conf Configuration for the Tool. + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + Tool with its Configuration. + + Equivalent to run(tool.getConf(), tool, args). + + @param tool Tool to run. + @param args command-line arguments to the tool. + @return exit code of the {@link Tool#run(String[])} method.]]> + + + + + + + + + + ToolRunner can be used to run classes implementing + Tool interface. It works in conjunction with + {@link GenericOptionsParser} to parse the + + generic hadoop command line arguments and modifies the + Configuration of the Tool. The + application-specific options are passed along without being modified. +

+ + @see Tool + @see GenericOptionsParser]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + this filter. + @param nbHash The number of hash function to consider. + @param hashType type of the hashing function (see + {@link org.apache.hadoop.util.hash.Hash}).]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Bloom filter, as defined by Bloom in 1970. +

+ The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by + the networking research community in the past decade thanks to the bandwidth efficiencies that it + offers for the transmission of set membership information between networked hosts. A sender encodes + the information into a bit vector, the Bloom filter, that is more compact than a conventional + representation. Computation and space costs for construction are linear in the number of elements. + The receiver uses the filter to test whether various elements are members of the set. Though the + filter will occasionally return a false positive, it will never return a false negative. When creating + the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size. + +

+ Originally created by + European Commission One-Lab Project 034819. + + @see Filter The general behavior of a filter + + @see Space/Time Trade-Offs in Hash Coding with Allowable Errors]]> + + + + + + + + + + + + + this filter. + @param nbHash The number of hash function to consider. + @param hashType type of the hashing function (see + {@link org.apache.hadoop.util.hash.Hash}).]]> + + + + + + + + + this counting Bloom filter. +
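+ A small add/test sketch against the constructor and methods documented here (the vector size, hash count and MURMUR_HASH choice are arbitrary):
+
+     BloomFilter filter = new BloomFilter(1024, 4, Hash.MURMUR_HASH);
+     filter.add(new Key("hadoop".getBytes()));
+     filter.membershipTest(new Key("hadoop".getBytes()));  // always true: no false negatives
+     filter.membershipTest(new Key("other".getBytes()));   // normally false; true only on a false positive
+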

+ Invariant: nothing happens if the specified key does not belong to this counter Bloom filter. + @param key The key to remove.]]> + + + + + + + + + + + + key -> count map. +

NOTE: due to the bucket size of this filter, inserting the same + key more than 15 times will cause an overflow at all filter positions + associated with this key, and it will significantly increase the error + rate for this and other keys. For this reason the filter can only be + used to store small count values 0 <= N << 15. + @param key key to be tested + @return 0 if the key is not present. Otherwise, a positive value v will + be returned such that v == count with probability equal to the + error rate of this filter, and v > count otherwise. + Additionally, if the filter experienced an underflow as a result of + {@link #delete(Key)} operation, the return value may be lower than the + count with the probability of the false negative rate of such + filter.]]> + + + + + + + + + + + + + + + + + + + + + + counting Bloom filter, as defined by Fan et al. in a ToN + 2000 paper. +

+ A counting Bloom filter is an improvement to a standard Bloom filter as it + allows dynamic additions and deletions of set membership information. This + is achieved through the use of a counting vector instead of a bit vector. +

+ Originally created by + European Commission One-Lab Project 034819. + + @see Filter The general behavior of a filter + + @see Summary cache: a scalable wide-area web cache sharing protocol]]> + + + + + + + + + + + + + + Builds an empty Dynamic Bloom filter. + @param vectorSize The number of bits in the vector. + @param nbHash The number of hash function to consider. + @param hashType type of the hashing function (see + {@link org.apache.hadoop.util.hash.Hash}). + @param nr The threshold for the maximum number of keys to record in a + dynamic Bloom filter row.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + dynamic Bloom filter, as defined in the INFOCOM 2006 paper. +

+ A dynamic Bloom filter (DBF) makes use of an s * m bit matrix but + each of the s rows is a standard Bloom filter. The creation + process of a DBF is iterative. At the start, the DBF is a 1 * m + bit matrix, i.e., it is composed of a single standard Bloom filter. + It assumes that nr elements are recorded in the + initial bit vector, where nr <= n (n is + the cardinality of the set A to record in the filter).

+ As the size of A grows during the execution of the application, + several keys must be inserted in the DBF. When inserting a key into the DBF, + one must first get an active Bloom filter in the matrix. A Bloom filter is + active when the number of recorded keys, nr, is + strictly less than the current cardinality of A, n. + If an active Bloom filter is found, the key is inserted and + nr is incremented by one. On the other hand, if there + is no active Bloom filter, a new one is created (i.e., a new row is added to + the matrix) according to the current size of A and the element + is added in this new Bloom filter and the nr value of + this new Bloom filter is set to one. A given key is said to belong to the + DBF if the k positions are set to one in one of the matrix rows. +

+ Originally created by + European Commission One-Lab Project 034819. + + @see Filter The general behavior of a filter + @see BloomFilter A Bloom filter + + @see Theory and Network Applications of Dynamic Bloom Filters]]> + + + + + + + + + + + this filter. + @param nbHash The number of hash functions to consider. + @param hashType type of the hashing function (see {@link Hash}).]]> + + + + + + this filter. + @param key The key to add.]]> + + + + + + this filter. + @param key The key to test. + @return boolean True if the specified key belongs to this filter. + False otherwise.]]> + + + + + + this filter and a specified filter. +

+ Invariant: The result is assigned to this filter. + @param filter The filter to AND with.]]> + + + + + + this filter and a specified filter. +

+ Invariant: The result is assigned to this filter. + @param filter The filter to OR with.]]> + + + + + + this filter and a specified filter. +

+ Invariant: The result is assigned to this filter. + @param filter The filter to XOR with.]]> + + + + + this filter. +

+ The result is assigned to this filter.]]> + + + + + + this filter. + @param keys The list of keys.]]> + + + + + + this filter. + @param keys The collection of keys.]]> + + + + + + this filter. + @param keys The array of keys.]]> + + + + + + + + + + + + + this filter.]]> + + + + + + + + + + + + + + + + + + + + A filter is a data structure which aims at offering a lossy summary of a set A. The + key idea is to map entries of A (also called keys) into several positions + in a vector through the use of several hash functions. +

+ Typically, a filter will be implemented as a Bloom filter (or a Bloom filter extension). +

+ It must be extended in order to define the real behavior. + + @see Key The general behavior of a key + @see HashFunction A hash function]]> + + + + + + + + + Builds a hash function that must obey to a given maximum number of returned values and a highest value. + @param maxValue The maximum highest returned value. + @param nbHash The number of resulting hashed values. + @param hashType type of the hashing function (see {@link Hash}).]]> + + + + + this hash function. A NOOP]]> + + + + + + + + + + + + + + + + + + + + + + + + + Builds a key with a default weight. + @param value The byte value of this key.]]> + + + + + + Builds a key with a specified weight. + @param value The value of this key. + @param weight The weight associated to this key.]]> + + + + + + + + + + + + this key.]]> + + + + + this key.]]> + + + + + + this key with a specified value. + @param weight The increment.]]> + + + + + this key by one.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The idea is to randomly select a bit to reset.]]> + + + + + + The idea is to select the bit to reset that will generate the minimum + number of false negative.]]> + + + + + + The idea is to select the bit to reset that will remove the maximum number + of false positive.]]> + + + + + + The idea is to select the bit to reset that will, at the same time, remove + the maximum number of false positve while minimizing the amount of false + negative generated.]]> + + + + + Originally created by + European Commission One-Lab Project 034819.]]> + + + + + + + + + + + + + + this filter. + @param nbHash The number of hash function to consider. + @param hashType type of the hashing function (see + {@link org.apache.hadoop.util.hash.Hash}).]]> + + + + + + + + + this retouched Bloom filter. +

+ Invariant: if the false positive is null, nothing happens. + @param key The false positive key to add.]]> + + + + + + this retouched Bloom filter. + @param coll The collection of false positive.]]> + + + + + + this retouched Bloom filter. + @param keys The list of false positive.]]> + + + + + + this retouched Bloom filter. + @param keys The array of false positive.]]> + + + + + + + this retouched Bloom filter. + @param scheme The selective clearing scheme to apply.]]> + + + + + + + + + + + + retouched Bloom filter, as defined in the CoNEXT 2006 paper. +

+ It allows the removal of selected false positives at the cost of introducing + random false negatives, and with the benefit of eliminating some random false + positives at the same time. + +

+ Originally created by + European Commission One-Lab Project 034819. + + @see Filter The general behavior of a filter + @see BloomFilter A Bloom filter + @see RemoveScheme The different selective clearing algorithms + + @see Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + length, and + the provided seed value + @param bytes input bytes + @param length length of the valid bytes to consider + @param initval seed value + @return hash value]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The best hash table sizes are powers of 2. There is no need to do mod + a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask. + For example, if you need only 10 bits, do + h = (h & hashmask(10)); + In which case, the hash table should have hashsize(10) elements. + +

If you are hashing n strings byte[][] k, do it like this: + for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h); + +

By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this + code any way you wish, private, educational, or commercial. It's free. + +

Use for hash table lookup, or anything where one collision in 2^^32 is + acceptable. Do NOT use for cryptographic purposes.]]> + + + + + + + + + + + lookup3.c, by Bob Jenkins, May 2006, Public Domain. + + You can use this free for any purpose. It's in the public domain. + It has no warranty. + + + @see lookup3.c + @see Hash Functions (and how this + function compares to others such as CRC, MD?, etc + @see Has update on the + Dr. Dobbs Article]]> + + + + + + + + + + + + + + + + The C version of MurmurHash 2.0 found at that site was ported + to Java by Andrzej Bialecki (ab at getopt org).

]]> +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker, + as {@link JobTracker.State} + + @return the current state of the JobTracker.]]> + + + + + JobTracker + + @return the size of heap memory used by the JobTracker]]> + + + + + JobTracker + + @return the configured size of max heap memory that can be used by the JobTracker]]> + + + + + + + + + + + + ClusterStatus provides clients with information such as: +
    +
  1. + Size of the cluster.
  2. + Name of the trackers.
  3. + Task capacity of the cluster.
  4. + The number of currently running map & reduce tasks.
  5. + State of the JobTracker.

+ +

Clients can query for the latest ClusterStatus, via + {@link JobClient#getClusterStatus()}.

+ + @see JobClient]]> +
+
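+ For example, a monitoring client might read the snapshot like this (a sketch; the Configuration setup is omitted):
+
+     JobClient jobClient = new JobClient(new JobConf(conf));
+     ClusterStatus status = jobClient.getClusterStatus();
+     int trackers   = status.getTaskTrackers();    // size of the cluster
+     int maxMaps    = status.getMaxMapTasks();     // map task capacity
+     int runningMap = status.getMapTasks();        // currently running map tasks
+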
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter can be of + any {@link Enum} type.

+ +

Counters are bunched into {@link Group}s, each comprising of + counters from a particular Enum class. + @deprecated Use {@link org.apache.hadoop.mapreduce.Counters} instead.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of counters, comprising of counters from a particular + counter {@link Enum} class. + +

Group handles localization of the class name and the + counter names.

]]> +
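+ A hedged sketch of bumping an application-defined counter from the old mapred API (the enum and the empty-value check are illustrative):
+
+     enum MyCounters { MALFORMED_RECORDS }
+
+     public void map(LongWritable key, Text value,
+                     OutputCollector<Text, IntWritable> output,
+                     Reporter reporter) throws IOException {
+       if (value.getLength() == 0) {
+         reporter.incrCounter(MyCounters.MALFORMED_RECORDS, 1);  // grouped under the enum's class name
+         return;
+       }
+       output.collect(value, new IntWritable(1));
+     }
+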
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param fs the file system that the file is on + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides a generic implementation of + {@link #getSplits(JobConf, int)}. + Subclasses of FileInputFormat can also override the + {@link #isSplitable(FileSystem, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s. + @deprecated Use {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat} + instead.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +
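+ For instance, the isSplitable hook described above can be used to force one whole file per mapper (the class name is illustrative):
+
+     public class WholeFileTextInputFormat extends TextInputFormat {
+       protected boolean isSplitable(FileSystem fs, Path file) {
+         return false;   // never split; each mapper sees an entire file
+       }
+     }
+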

Note: The following is valid only if the {@link OutputCommitter} + is {@link FileOutputCommitter}. If OutputCommitter is not + a FileOutputCommitter, the task's temporary output + directory is the same as {@link #getOutputPath(JobConf)} i.e. + ${mapred.output.dir}

+ +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the attemptid, say + attempt_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any + side-files required in ${mapred.work.output.dir} during execution + of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the + framework will move them out similarly - thus she doesn't have to pick + unique paths per task-attempt.

+ +

Note: the value of ${mapred.work.output.dir} during + execution of a particular task-attempt is actually + ${mapred.output.dir}/_temporary/_{$taskid}, and this value is + set by the map-reduce framework. So, just create any side-files in the + path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce + task to take advantage of this feature.

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
+
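+ A sketch of creating such a side-file from inside a task, relative to the per-attempt work directory (job is the task's JobConf; the file name is arbitrary):
+
+     Path workDir = FileOutputFormat.getWorkOutputPath(job);   // ${mapred.work.output.dir}
+     FileSystem fs = workDir.getFileSystem(job);
+     FSDataOutputStream side = fs.create(new Path(workDir, "side-data.txt"));
+     side.writeBytes("auxiliary output\n");
+     side.close();
+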
+ + + + + + + + + + + + + The generated name can be used to create custom files from within the + different tasks for the job, the names for different tasks will not collide + with each other.

+ +

The given name is postfixed with the task type, 'm' for maps, 'r' for + reduces and the task partition number. For example, given a name 'test' + running on the first map of the job the generated name will be + 'test-m-00000'.

+ + @param conf the configuration for the job. + @param name the name to make unique. + @return a unique name across all tasks of the job.]]> +
+
+ + + + + The path can be used to create custom files from within the map and + reduce tasks. The path name will be unique for each task. The path parent + will be the job output directory.

ls + +

This method uses the {@link #getUniqueName} method to make the file name + unique for the task.

+ + @param conf the configuration for the job. + @param name the name for the file. + @return a unique path across all tasks of the job.]]> +
+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For example, a split could + be a <input-file-path, start, offset> tuple. + + @param job job configuration. + @param numSplits the desired number of splits, a hint. + @return an array of {@link InputSplit}s for the job.]]> +

+ + @param split the {@link InputSplit} + @param job the job that this split belongs to + @return a {@link RecordReader}]]> +
+
+ + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
  1. + Validate the input-specification of the job.
  2. + Split-up the input file(s) into logical {@link InputSplit}s, each of + which is then assigned to an individual {@link Mapper}.
  3. + Provide the {@link RecordReader} implementation to be used to glean + input records from the logical InputSplit for processing by + the {@link Mapper}.
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Clearly, logical splits based on input-size are insufficient for many + applications since record boundaries are to be respected. In such cases, the + application has to also implement a {@link RecordReader} on whom lies the + responsibility to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see JobClient + @see FileInputFormat + @deprecated Use {@link org.apache.hadoop.mapreduce.InputFormat} instead.]]> +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader + @deprecated Use {@link org.apache.hadoop.mapreduce.InputSplit} instead.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + jobid doesn't correspond to any known job. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobClient is the primary interface for the user-job to interact + with the {@link JobTracker}. + + JobClient provides facilities to submit jobs, track their + progress, access component-tasks' reports/logs, get the Map-Reduce cluster + status information etc. + +

The job submission process involves: +

    +
  1. + Checking the input and output specifications of the job.
  2. + Computing the {@link InputSplit}s for the job.
  3. + Setting up the requisite accounting information for the {@link DistributedCache} + of the job, if necessary.
  4. + Copying the job's jar and configuration to the map-reduce system directory + on the distributed file-system.
  5. + Submitting the job to the JobTracker and optionally monitoring + its status.

+ + Normally the user creates the application, describes various facets of the + job via {@link JobConf} and then uses the JobClient to submit + the job and monitor its progress. + +

Here is an example on how to use JobClient:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     job.setInputPath(new Path("in"));
+     job.setOutputPath(new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+
+     // Submit the job, then poll for progress until the job is complete
+     JobClient.runJob(job);
+ 

+ +

Job Control

+ +

At times clients would chain map-reduce jobs to accomplish complex tasks + which cannot be done via a single map-reduce job. This is fairly easy since + the output of a job typically goes to the distributed file-system and + can be used as the input for the next job.

+ +

However, this also means that the onus of ensuring jobs are complete + (success/failure) lies squarely on the clients. In such situations the + various job-control options are: +

    +
  1. + {@link #runJob(JobConf)} : submits the job and returns only after + the job has completed. +
  2. +
  3. + {@link #submitJob(JobConf)} : only submits the job; the client can then poll the + returned {@link RunningJob} handle to query status and make + scheduling decisions, as sketched below. +
  4. +
  5. + {@link JobConf#setJobEndNotificationURI(String)} : setup a notification + on job-completion, thus avoiding polling. +
  6. +
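+ A minimal sketch of the second option above, assuming the enclosing method declares throws IOException and InterruptedException (the 5 second poll interval is an arbitrary example):
+
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     // ... configure mapper, reducer, input/output paths as in the example above
+     JobClient jc = new JobClient(job);
+     RunningJob running = jc.submitJob(job);
+     while (!running.isComplete()) {
+       System.out.println("map " + (running.mapProgress() * 100) + "% done");
+       Thread.sleep(5000);                 // poll for completion every 5 seconds
+     }
+     if (!running.isSuccessful()) {
+       System.err.println("Job failed!");
+     }
+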

+ + @see JobConf + @see ClusterStatus + @see Tool + @see DistributedCache]]> +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If the parameter {@code loadDefaults} is false, the new instance + will not load resources from the default files. + + @param loadDefaults specifies whether to load from the default files]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if framework should keep the intermediate files + for failed tasks, false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the outputs of the maps are to be compressed, + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This comparator should be provided if the equivalence rules for keys + for sorting the intermediates are different from those for grouping keys + before each call to + {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.

+ +

For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed + in a single call to the reduce function if K1 and K2 compare as equal.

+ +

Since {@link #setOutputKeyComparatorClass(Class)} can be used to control + how keys are sorted, this can be used in conjunction to simulate + secondary sort on values.
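+ A minimal sketch of that combination, assuming two hypothetical RawComparator implementations: FullKeyComparator sorts on the whole composite key, while FirstPartComparator compares only its natural (grouping) part:
+
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     // Sort map-outputs on the full composite key ...
+     job.setOutputKeyComparatorClass(FullKeyComparator.class);
+     // ... but group values into a single reduce() call by the natural key only
+     job.setOutputValueGroupingComparator(FirstPartComparator.class);
+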

+ +

Note: This is not a guarantee of the reduce sort being + stable in any sense. (In any case, with the order of available + map-outputs to the reduce being non-deterministic, it wouldn't make + that much sense.)

+ + @param theClass the comparator class to be used for grouping keys. + It should implement RawComparator. + @see #setOutputKeyComparatorClass(Class)]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. Typically the combiner is same as the + the {@link Reducer} for the job i.e. {@link #getReducerClass()}. + + @return the user-defined combiner class used to combine map-outputs.]]> + + + + + + combiner class used to combine map-outputs + before being sent to the reducers. + +

The combiner is an application-specified aggregation operation, which + can help cut down the amount of data transferred between the + {@link Mapper} and the {@link Reducer}, leading to better performance.

+ +

The framework may invoke the combiner 0, 1, or multiple times, in both + the mapper and reducer tasks. In general, the combiner is called as the + sort/merge result is written to disk. The combiner must: +

    +
  • be side-effect free
  • +
  • have the same input and output key types and the same input and + output value types
  • +

+ +

Typically the combiner is the same as the Reducer for the + job i.e. {@link #setReducerClass(Class)}.
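+ For example, reusing the MyJob placeholder classes from the other examples in this file:
+
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     job.setMapperClass(MyJob.MyMapper.class);
+     // Reuse the reducer as the combiner to aggregate map-outputs locally
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+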

+ + @param theClass the user-defined combiner class used to combine + map-outputs.]]> +
+
+ + + true. + + @return true if speculative execution be used for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on, else false.]]> + + + + + true. + + @return true if speculative execution be + used for this job for map tasks, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for map tasks, + else false.]]> + + + + + true. + + @return true if speculative execution be used + for reduce tasks for this job, + false otherwise.]]> + + + + + + true if speculative execution + should be turned on for reduce tasks, + else false.]]> + + + + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + Note: This is only a hint to the framework. The actual + number of spawned map tasks depends on the number of {@link InputSplit}s + generated by the job's {@link InputFormat#getSplits(JobConf, int)}. + + A custom {@link InputFormat} is typically used to accurately control + the number of map tasks for the job.

+ +

How many maps?

+ +

The number of maps is usually driven by the total size of the inputs + i.e. total number of blocks of the input files.

+ +

The right level of parallelism for maps seems to be around 10-100 maps + per node, although it has been set up to 300 or so for very CPU-light map + tasks. Task setup takes a while, so it is best if the maps take at least a + minute to execute.

+ +

The default behavior of file-based {@link InputFormat}s is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of input files. However, the {@link FileSystem} blocksize of the + input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.

+ +

Thus, if you expect 10TB of input data and have a blocksize of 128MB, + you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is + used to set it even higher.
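+ As a sketch, that hint could be expressed as follows (the figure comes straight from the 10TB/128MB example above):
+
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     // Only a hint: the InputFormat still decides the actual number of splits
+     job.setNumMapTasks(82000);
+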

+ + @param n the number of map tasks for this job. + @see InputFormat#getSplits(JobConf, int) + @see FileInputFormat + @see FileSystem#getDefaultBlockSize() + @see FileStatus#getBlockSize()]]> +
+
+ + + 1. + + @return the number of reduce tasks for this job.]]> + + + + + + How many reduces? + +

The right number of reduces seems to be 0.95 or + 1.75 multiplied by (<no. of nodes> * + + mapred.tasktracker.reduce.tasks.maximum). +

+ +

With 0.95 all of the reduces can launch immediately and + start transferring map outputs as the maps finish. With 1.75 + the faster nodes will finish their first round of reduces and launch a + second wave of reduces, doing a much better job of load balancing.

+ +

Increasing the number of reduces increases the framework overhead, but + increases load balancing and lowers the cost of failures.

+ +

The scaling factors above are slightly less than whole numbers to + reserve a few reduce slots in the framework for speculative-tasks, failures + etc.
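+ A sketch of the 0.95 rule, assuming the number of nodes is known to the caller (the cluster size below is a made-up example value):
+
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     int numNodes = 20;                                 // example cluster size
+     int reduceSlots = job.getInt("mapred.tasktracker.reduce.tasks.maximum", 2);
+     job.setNumReduceTasks((int) (0.95 * numNodes * reduceSlots));
+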

+ +

Reducer NONE

+ +

It is legal to set the number of reduce-tasks to zero.

+ +

In this case the output of the map-tasks goes directly to the distributed + file-system, to the path set by + {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the + framework doesn't sort the map-outputs before writing them out to HDFS.
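+ For example, a map-only job can be sketched as:
+
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     job.setMapperClass(MyJob.MyMapper.class);
+     // No reduce phase: map output goes, unsorted, straight to the output path
+     job.setNumReduceTasks(0);
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+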

+ + @param n the number of reduce tasks for this job.]]> +
+
+ + + mapred.map.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per map task.]]> + + + + + + + + + + + mapred.reduce.max.attempts + property. If this property is not already set, the default is 4 attempts. + + @return the max number of attempts per reduce task.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + noFailures, the + tasktracker is blacklisted for this job. + + @param noFailures maximum no. of failures of a given job per tasktracker.]]> + + + + + blacklisted for this job. + + @return the maximum no. of failures of a given job per tasktracker.]]> + + + + + failed. + + Defaults to zero, i.e. any failed map-task results in + the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of map tasks that can fail without + the job being aborted.]]> + + + + + failed. + + Defaults to zero, i.e. any failed reduce-task results + in the job being declared as {@link JobStatus#FAILED}. + + @return the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + failed. + + @param percent the maximum percentage of reduce tasks that can fail without + the job being aborted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The debug script can aid debugging of failed map tasks. The script is + given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the map failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setMapDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param mDbgScript the script name]]> +
+
+ + + + + + + + + The debug script can aid debugging of failed reduce tasks. The script + is given task's stdout, stderr, syslog, jobconf files as arguments.

+ +

The debug command, run on the node where the reduce failed, is:

+

+ $script $stdout $stderr $syslog $jobconf. +

+ +

The script file is distributed through {@link DistributedCache} + APIs. The script file needs to be symlinked.

+ +

Here is an example on how to submit a script +

+ job.setReduceDebugScript("./myscript");
+ DistributedCache.createSymlink(job);
+ DistributedCache.addCacheFile("/debug/scripts/myscript#myscript");
+ 

+ + @param rDbgScript the script name]]> +
+
+ + + + + + + + null if it hasn't + been set. + @see #setJobEndNotificationURI(String)]]> + + + + + + The uri can contain 2 special parameters: $jobId and + $jobStatus. Those, if present, are replaced by the job's + identifier and completion-status respectively.

+ +

This is typically used by application-writers to implement chaining of + Map-Reduce jobs in an asynchronous manner.
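+ For example (the host and path below are made-up placeholders, and job is the JobConf about to be submitted):
+
+     job.setJobEndNotificationURI(
+         "http://workflow.example.com/notify?jobid=$jobId&status=$jobStatus");
+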

+ + @param uri the job end notification uri + @see JobStatus + @see Job Completion and Chaining]]> +
+
+ + + + When a job starts, a shared directory is created at location + + ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ . + This directory is exposed to the users through + job.local.dir . + So, the tasks can use this space + as scratch space and share files among them.
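+ A sketch of how a task might use it, e.g. from Mapper.configure() (the file name is an arbitrary example):
+
+     public void configure(JobConf job) {
+       // Also exposed as System.getProperty("job.local.dir")
+       String jobLocalDir = job.get("job.local.dir");
+       // Scratch space shared by all tasks of this job on this node
+       File scratch = new File(jobLocalDir, "scratch.tmp");
+       // ... write temporary data to 'scratch' ...
+     }
+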

+ This value is also available as a System property. + + @return The localized job-specific shared directory]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + If a job doesn't specify its virtual memory requirement by setting + {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to {@link #DISABLED_MEMORY_LIMIT}, + tasks are assured a memory limit set to this property. This property is + disabled by default, and if not explicitly set to a valid value by the + administrators and if a job doesn't specify its virtual memory + requirements, the job's tasks will not be assured anything and may be + killed by a TT that intends to control the total memory usage of the tasks + via memory management functionality. + +

+ + This value should in general be less than the cluster-wide configuration + {@link #UPPER_LIMIT_ON_TASK_VMEM_PROPERTY} . If not or if it not set, + TaskTracker's memory management may be disabled and a scheduler's memory + based scheduling decisions will be affected. Please refer to the + documentation of the configured scheduler to see how this property is used.]]> + + + + + + + This value will be used by TaskTrackers for monitoring the memory usage of + tasks of this jobs. If a TaskTracker's memory management functionality is + enabled, each task of this job will be allowed to use a maximum virtual + memory specified by this property. If the task's memory usage goes over + this value, the task will be failed by the TT. If not set, the cluster-wide + configuration {@link #MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY} is used as the + default value for memory requirements. If this property cascaded with + {@link #MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY} becomes equal to -1, job's + tasks will not be assured anything and may be killed by a TT that intends + to control the total memory usage of the tasks via memory management + functionality. If the memory management functionality is disabled on a TT, + this value is ignored. + +

+ + This value should also be not more than the cluster-wide configuration + {@link #UPPER_LIMIT_ON_TASK_VMEM_PROPERTY} which has to be set by the site + administrators. + +

+ + This value may be used by schedulers that support scheduling based on job's + memory requirements. In general, a task of this job will be scheduled on a + TaskTracker only if the amount of virtual memory still unoccupied on the + TaskTracker is greater than or equal to this value. But different + schedulers can take different decisions. Please refer to the documentation + of the scheduler being configured to see if it does memory based scheduling + and if it does, how this property is used by that scheduler. + + @see #setMaxVirtualMemoryForTask(long) + @see #getMaxVirtualMemoryForTask()]]> + + + + + + + This value may be used by schedulers that support scheduling based on job's + memory requirements. In general, a task of this job will be scheduled on a + TaskTracker, only if the amount of physical memory still unoccupied on the + TaskTracker is greater than or equal to this value. But different + schedulers can take different decisions. Please refer to the documentation + of the scheduler being configured to see how it does memory based + scheduling and how this variable is used by that scheduler. + + @see #setMaxPhysicalMemoryForTask(long) + @see #getMaxPhysicalMemoryForTask()]]> + + + + + + + If it is not set on a TaskTracker, TaskTracker's memory management will be + disabled.]]> + + + + JobConf is the primary interface for a user to describe a + map-reduce job to the Hadoop framework for execution. The framework tries to + faithfully execute the job as-is described by JobConf, however: +

    +
  1. + Some configuration parameters might have been marked as + + final by administrators and hence cannot be altered. +
  2. +
  3. + While some job parameters are straight-forward to set + (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with the + rest of the framework and/or job-configuration and are relatively more + complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}). +
  4. +

+ +

JobConf typically specifies the {@link Mapper}, combiner + (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and + {@link OutputFormat} implementations to be used etc. + +

Optionally JobConf is used to specify other advanced facets + of the job such as Comparators to be used, files to be put in + the {@link DistributedCache}, whether or not intermediate and/or job outputs + are to be compressed (and how), and debuggability via user-provided scripts + ({@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}) + for doing post-processing on task logs, task's stdout, stderr, syslog, etc.

+ +

Here is an example on how to configure a job via JobConf:

+

+     // Create a new JobConf
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     
+     // Specify various job-specific parameters     
+     job.setJobName("myjob");
+     
+     FileInputFormat.setInputPaths(job, new Path("in"));
+     FileOutputFormat.setOutputPath(job, new Path("out"));
+     
+     job.setMapperClass(MyJob.MyMapper.class);
+     job.setCombinerClass(MyJob.MyReducer.class);
+     job.setReducerClass(MyJob.MyReducer.class);
+     
+     job.setInputFormat(SequenceFileInputFormat.class);
+     job.setOutputFormat(SequenceFileOutputFormat.class);
+ 

+ + @see JobClient + @see ClusterStatus + @see Tool + @see DistributedCache + @deprecated Use {@link Configuration} instead]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + any job + run on the jobtracker started at 200707121733, we would use : +
 
+ JobID.getTaskIDsPattern("200707121733", null);
+ 
+ which will return : +
 "job_200707121733_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @return a regex pattern matching JobIDs]]> +
+
+ + + An example JobID is : + job_200707121733_0003 , which represents the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse JobID strings, but rather + use appropriate constructors or {@link #forName(String)} method. + + @see TaskID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + "N/A" + + @return Scheduling information associated to particular Job Queue]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + zero. + + @param conf configuration for the JobTracker. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Output pairs need not be of the same types as input pairs. A given + input pair may map to zero or many output pairs. Output pairs are + collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes a significant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the input key. + @param value the input value. + @param output collects mapped keys and values. + @param reporter facility to report progress.]]> +
+ + + Maps are the individual tasks which transform input records into a + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link JobConf} for the + job via the {@link JobConfigurable#configure(JobConf)} and initialize + themselves. Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

The framework then calls + {@link #map(Object, Object, OutputCollector, Reporter)} + for each key/value pair in the InputSplit for that task.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the grouping by specifying + a Comparator via + {@link JobConf#setOutputKeyComparatorClass(Class)}.

+ +

The grouped Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

The intermediate, grouped outputs are always stored in + {@link SequenceFile}s. Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the JobConf.

+ +

If the job has + zero + reduces then the output of the Mapper is directly written + to the {@link FileSystem} without grouping by keys.

+ +

Example:

+

+     public class MyMapper<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Mapper<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+       
+       private String mapTaskId;
+       private String inputFile;
+       private int noRecords = 0;
+       
+       public void configure(JobConf job) {
+         mapTaskId = job.get("mapred.task.id");
+         inputFile = job.get("map.input.file");
+       }
+       
+       public void map(K key, V val,
+                       OutputCollector<K, V> output, Reporter reporter)
+       throws IOException {
+         // Process the <key, value> pair (assume this takes a while)
+         // ...
+         // ...
+         
+         // Let the framework know that we are alive, and kicking!
+         // reporter.progress();
+         
+         // Process some more
+         // ...
+         // ...
+         
+         // Increment the no. of <key, value> pairs processed
+         ++noRecords;
+
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+        
+         // Every 100 records update application-level status
+         if ((noRecords%100) == 0) {
+           reporter.setStatus(mapTaskId + " processed " + noRecords + 
+                              " from input-file: " + inputFile); 
+         }
+         
+         // Output the result
+         output.collect(key, val);
+       }
+     }
+ 

+ +

Applications may write a custom {@link MapRunnable} to exert greater + control on map processing e.g. multi-threaded Mappers etc.

+ + @see JobConf + @see InputFormat + @see Partitioner + @see Reducer + @see MapReduceBase + @see MapRunnable + @see SequenceFile + @deprecated Use {@link org.apache.hadoop.mapreduce.Mapper} instead.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + Provides default no-op implementations for a few methods, most non-trivial + applications need to override some of them.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + <key, value> pairs. + +

Mapping of input records to output records is complete when this method + returns.

+ + @param input the {@link RecordReader} to read the input records. + @param output the {@link OutputCollector} to collect the outputrecords. + @param reporter {@link Reporter} to report progress, status-updates etc. + @throws IOException]]> +
+
+ + Custom implementations of MapRunnable can exert greater + control on map processing e.g. multi-threaded, asynchronous mappers etc.

+ + @see Mapper + @deprecated Use {@link org.apache.hadoop.mapreduce.Mapper} instead.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nearly + equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} + to construct RecordReader's for MultiFileSplit's. + @see MultiFileSplit + @deprecated Use {@link org.apache.hadoop.mapred.lib.CombineFileInputFormat} instead]]> +
+
+ + + + + + + + + + + + + MultiFileSplit can be used to implement {@link RecordReader}'s, with + reading one record per file. + @see FileSplit + @see MultiFileInputFormat + @deprecated Use {@link org.apache.hadoop.mapred.lib.CombineFileSplit} instead]]> + + + + + + + + + + + + + + + <key, value> pairs output by {@link Mapper}s + and {@link Reducer}s. + +

OutputCollector is the generalization of the facility + provided by the Map-Reduce framework to collect data output by either the + Mapper or the Reducer i.e. intermediate outputs + or the output of the job.

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OutputCommitter describes the commit of task output for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputCommitter of + the job to:

+

    +
  1. + Setup the job during initialization. For example, create the temporary + output directory for the job during the initialization of the job. +
  2. +
  3. + Cleanup the job after the job completion. For example, remove the + temporary output directory after the job completion. +
  4. +
  5. + Setup the task temporary output. +
  6. +
  7. + Check whether a task needs a commit. This is to avoid the commit + procedure if a task does not need commit. +
  8. +
  9. + Commit of the task output. +
  10. +
  11. + Discard the task commit. +
  12. +
+ + @see FileOutputCommitter + @see JobContext + @see TaskAttemptContext + @deprecated Use {@link org.apache.hadoop.mapreduce.OutputCommitter} instead.]]> +
+
+ + + + + + + + + + + + + + + + + + + This is to validate the output specification for the job when it is + a job is submitted. Typically checks that it does not already exist, + throwing an exception when it already exists, so that output is not + overwritten.

+ + @param ignored + @param job job configuration. + @throws IOException when output should not be attempted]]> +
+
+ + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    +
  1. + Validate the output-specification of the job. For e.g. check that the + output directory doesn't already exist. +
  2. + Provide the {@link RecordWriter} implementation to be used to write out + the output files of the job. Output files are stored in a + {@link FileSystem}. +
  3. +
+ + @see RecordWriter + @see JobConf + @deprecated Use {@link org.apache.hadoop.mapreduce.OutputFormat} instead.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be partitioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.

+ + @see Reducer + @deprecated Use {@link org.apache.hadoop.mapreduce.Partitioner} instead.]]> +
+
+ + + + + + + + + + + + + + + + + + + true if there exists a key/value, + false otherwise. + @throws IOException]]> + + + + + + + + + + + + + + + RawKeyValueIterator is an iterator used to iterate over + the raw keys and values during sort/merge of intermediate data.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0.0 to 1.0. + @throws IOException]]> + + + + RecordReader reads <key, value> pairs from an + {@link InputSplit}. + +

RecordReader, typically, converts the byte-oriented view of + the input, provided by the InputSplit, and presents a + record-oriented view for the {@link Mapper} & {@link Reducer} tasks for + processing. It thus assumes the responsibility of processing record + boundaries and presenting the tasks with keys and values.

+ + @see InputSplit + @see InputFormat]]> +
+
+ + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param reporter facility to report progress. + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + Reduces values for a given key. + +

The framework calls this method for each + <key, (list of values)> pair in the grouped inputs. + Output values must be of the same type as input values. Input keys must + not be altered. The framework will reuse the key and value objects + that are passed into the reduce, therefore the application should clone + the objects they want to keep a copy of. In many cases, all values are + combined into zero or one value. +

+ +

Output pairs are collected with calls to + {@link OutputCollector#collect(Object,Object)}.

+ +

Applications can use the {@link Reporter} provided to report progress + or just indicate that they are alive. In scenarios where the application + takes a significant amount of time to process individual key/value + pairs, this is crucial since the framework might assume that the task has + timed-out and kill that task. The other way of avoiding this is to set + + mapred.task.timeout to a high-enough value (or even zero for no + time-outs).

+ + @param key the key. + @param values the list of values to reduce. + @param output to collect keys and combined values. + @param reporter facility to report progress.]]> +
+ + + The number of Reducers for the job is set by the user via + {@link JobConf#setNumReduceTasks(int)}. Reducer implementations + can access the {@link JobConf} for the job via the + {@link JobConfigurable#configure(JobConf)} method and initialize themselves. + Similarly they can use the {@link Closeable#close()} method for + de-initialization.

+ +

Reducer has 3 primary phases:

+
    +
  1. + +

    Shuffle

    + +

    Reducer is input the grouped output of a {@link Mapper}. + In this phase the framework, for each Reducer, fetches the + relevant partition of the output of all the Mappers, via HTTP. +

    +
  2. + +
  3. +

    Sort

    + +

    The framework groups Reducer inputs by keys + (since different Mappers may have output the same key) in this + stage.

    + +

    The shuffle and sort phases occur simultaneously i.e. while outputs are + being fetched they are merged.

    + +
    SecondarySort
    + +

    If equivalence rules for keys while grouping the intermediates are + different from those for grouping keys before reduction, then one may + specify a Comparator via + {@link JobConf#setOutputValueGroupingComparator(Class)}. Since + {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to + control how intermediate keys are grouped, these can be used in conjunction + to simulate secondary sort on values.

    + + + For example, say that you want to find duplicate web pages and tag them + all with the url of the "best" known example. You would set up the job + like: +
      +
    • Map Input Key: url
    • +
    • Map Input Value: document
    • +
    • Map Output Key: document checksum, url pagerank
    • +
    • Map Output Value: url
    • +
    • Partitioner: by checksum
    • +
    • OutputKeyComparator: by checksum and then decreasing pagerank
    • +
    • OutputValueGroupingComparator: by checksum
    • +
    +
  4. + +
  5. +

    Reduce

    + +

    In this phase the + {@link #reduce(Object, Iterator, OutputCollector, Reporter)} + method is called for each <key, (list of values)> pair in + the grouped inputs.

    +

    The output of the reduce task is typically written to the + {@link FileSystem} via + {@link OutputCollector#collect(Object, Object)}.

    +
  6. +
+ +

The output of the Reducer is not re-sorted.

+ +

Example:

+

+     public class MyReducer<K extends WritableComparable, V extends Writable> 
+     extends MapReduceBase implements Reducer<K, V, K, V> {
+     
+       static enum MyCounters { NUM_RECORDS }
+        
+       private String reduceTaskId;
+       private int noKeys = 0;
+       
+       public void configure(JobConf job) {
+         reduceTaskId = job.get("mapred.task.id");
+       }
+       
+       public void reduce(K key, Iterator<V> values,
+                          OutputCollector<K, V> output, 
+                          Reporter reporter)
+       throws IOException {
+       
+         // Process
+         int noValues = 0;
+         while (values.hasNext()) {
+           V value = values.next();
+           
+           // Increment the no. of values for this key
+           ++noValues;
+           
+           // Process the <key, value> pair (assume this takes a while)
+           // ...
+           // ...
+           
+           // Let the framework know that we are alive, and kicking!
+           if ((noValues%10) == 0) {
+             reporter.progress();
+           }
+         
+           // Process some more
+           // ...
+           // ...
+           
+           // Output the <key, value> 
+           output.collect(key, value);
+         }
+         
+         // Increment the no. of <key, list of values> pairs processed
+         ++noKeys;
+         
+         // Increment counters
+         reporter.incrCounter(NUM_RECORDS, 1);
+         
+         // Every 100 keys update application-level status
+         if ((noKeys%100) == 0) {
+           reporter.setStatus(reduceTaskId + " processed " + noKeys);
+         }
+       }
+     }
+ 

+ + @see Mapper + @see Partitioner + @see Reporter + @see MapReduceBase + @deprecated Use {@link org.apache.hadoop.mapreduce.Reducer} instead.]]> +
+
+ + + + + + + + + + + + + + Counter of the given group/name.]]> + + + + + + + Counter of the given group/name.]]> + + + + + + + Enum. + @param amount A non-negative amount by which the counter is to + be incremented.]]> + + + + + + + + + + + + + + InputSplit that the map is reading from. + @throws UnsupportedOperationException if called outside a mapper]]> + + + + + + + + + {@link Mapper} and {@link Reducer} can use the Reporter + provided to report progress or just indicate that they are alive. In + scenarios where the application takes an insignificant amount of time to + process individual key/value pairs, this is crucial since the framework + might assume that the task has timed-out and kill that task. + +

Applications can also update {@link Counters} via the provided + Reporter .

+ + @see Progressable + @see Counters]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + progress of the job's cleanup-tasks, as a float between 0.0 + and 1.0. When all cleanup tasks have completed, the function returns 1.0. + + @return the progress of the job's cleanup-tasks. + @throws IOException]]> + + + + + + progress of the job's setup-tasks, as a float between 0.0 + and 1.0. When all setup tasks have completed, the function returns 1.0. + + @return the progress of the job's setup-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RunningJob is the user-interface to query for details on a + running Map-Reduce job. + +

Clients can get hold of RunningJob via the {@link JobClient} + and then query the running-job for details such as name, configuration, + progress etc.

+ + @see JobClient]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This allows the user to specify the key class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + This allows the user to specify the value class to be different + from the actual class ({@link BytesWritable}) used for writing

+ + @param conf the {@link JobConf} to modify + @param theClass the SequenceFile output key class.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + f. The filtering criteria is + MD5(key) % f == 0.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + f using + the criteria record# % f == 0. + For example, if the frequency is 10, one out of 10 records is returned.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if auto increment + {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}. + false otherwise.]]> + + + + + + + + + + + + + true if auto increment + {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}. + false otherwise.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Hadoop provides an optional mode of execution in which the bad records + are detected and skipped in further attempts. + +

This feature can be used when map/reduce tasks crash deterministically on + certain input. This happens due to bugs in the map/reduce function. The usual + course would be to fix these bugs. But sometimes this is not possible; + perhaps the bug is in third party libraries for which the source code is + not available. Due to this, the task never reaches completion even with + multiple attempts and complete data for that task is lost.

+ +

With this feature, only a small portion of data is lost surrounding + the bad record, which may be acceptable for some user applications; + see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}.

+ +

The skipping mode gets kicked off after a certain number of failures; + see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}.

+ +

In the skipping mode, the map/reduce task maintains the record range which + is getting processed at all times. Before giving the input to the + map/reduce function, it sends this record range to the Task tracker. + If the task crashes, the Task tracker knows which one was the last reported + range. On further attempts that range gets skipped.
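+ A sketch of enabling the feature for a job (the numbers are arbitrary example values):
+
+     JobConf job = new JobConf(new Configuration(), MyJob.class);
+     // Start skipping mode after 2 failed attempts of the same task
+     SkipBadRecords.setAttemptsToStartSkipping(job, 2);
+     // Accept losing at most 10 records around each bad map input record
+     SkipBadRecords.setMapperMaxSkipRecords(job, 10);
+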

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all task attempt IDs + of any jobtracker, in any job, of the first + map task, we would use : +
 
+ TaskAttemptID.getTaskAttemptIDsPattern(null, null, true, 1, null);
+ 
+ which will return : +
 "attempt_[^_]*_[0-9]*_m_000001_[0-9]*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @param attemptId the task attempt number, or null + @return a regex pattern matching TaskAttemptIDs]]> +
+
+ + + An example TaskAttemptID is : + attempt_200707121733_0003_m_000005_0 , which represents the + zeroth task attempt for the fifth map task in the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse TaskAttemptID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the first map task + of any jobtracker, of any job, we would use : +

 
+ TaskID.getTaskIDsPattern(null, null, true, 1);
+ 
+ which will return : +
 "task_[^_]*_[0-9]*_m_000001*" 
+ @param jtIdentifier jobTracker identifier, or null + @param jobId job number, or null + @param isMap whether the tip is a map, or null + @param taskId taskId number, or null + @return a regex pattern matching TaskIDs]]> +
+ + + + + + + + An example TaskID is : + task_200707121733_0003_m_000005 , which represents the + fifth map task in the third job running at the jobtracker + started at 200707121733. +

+ Applications should never construct or parse TaskID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + hadoop.log.dir.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the Job was added.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ([,]*) + func ::= tbl(,"") + class ::= @see java.lang.Class#forName(java.lang.String) + path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) + } + Reads expression from the mapred.join.expr property and + user-supplied join types from mapred.join.define.<ident> + types. Paths supplied to tbl are given as input paths to the + InputFormat class listed. + @see #compose(java.lang.String, java.lang.Class, java.lang.String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ,

) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + + + + + (tbl(,),tbl(,),...,tbl(,)) }]]> + + + + mapred.join.define.<ident> to a classname. In the expression + mapred.join.expr, the identifier will be assumed to be a + ComposableRecordReader. + mapred.join.keycomparator can be a classname used to compare keys + in the join. + @see JoinRecordReader + @see MultiFilterRecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + capacity children to position + id in the parent reader. + The id of a root CompositeRecordReader is -1 by convention, but relying + on this is not recommended.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + override(S1,S2,S3) will prefer values + from S3 over S2, and values from S2 over S1 for all keys + emitted from all sources.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [,,...,]]]> + + + + + + + out. + TupleWritable format: + {@code + ...... + }]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Mapper leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Mapper does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Mapper the configuration given for it, + mapperConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain +

+ + @param job job's JobConf to add the Mapper class. + @param klass the Mapper class to add. + @param inputKeyClass mapper input key class. + @param inputValueClass mapper input value class. + @param outputKeyClass mapper output key class. + @param outputValueClass mapper output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param mapperConf a JobConf with the configuration for the Mapper + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + If this method is overriden super.configure(...) should be + invoked at the beginning of the overwriter method.]]> + + + + + + + + + + map(...) methods of the Mappers in the chain.]]> + + + + + + + If this method is overriden super.close() should be + invoked at the end of the overwriter method.]]> + + + + + The Mapper classes are invoked in a chained (or piped) fashion, the output of + the first becomes the input of the second, and so on until the last Mapper, + the output of the last Mapper will be written to the task's output. +

+ The key functionality of this feature is that the Mappers in the chain do not + need to be aware that they are executed in a chain. This enables having + reusable specialized Mappers that can be combined to perform composite + operations within a single task. +

+ Special care has to be taken when creating chains so that the key/values output + by a Mapper are valid for the following Mapper in the chain. It is assumed + all Mappers and the Reducer in the chain use matching output and input key and + value classes, as no conversion is done by the chaining code. +

+ Using the ChainMapper and the ChainReducer classes it is possible to compose + Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An + immediate benefit of this pattern is a dramatic reduction in disk IO. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain. +

+ ChainMapper usage pattern: +

+

+ ...
+ conf.setJobName("chain");
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+ 

+ JobConf mapAConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class, + Text.class, Text.class, true, mapAConf); +

+ JobConf mapBConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, mapBConf); +

+ JobConf reduceConf = new JobConf(false); + ... + ChainReducer.setReducer(conf, XReduce.class, LongWritable.class, Text.class, + Text.class, Text.class, true, reduceConf); +

+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, null); +

+ ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class, + LongWritable.class, LongWritable.class, true, null); +

+ FileInputFormat.setInputPaths(conf, inDir); + FileOutputFormat.setOutputPath(conf, outDir); + ... +

+ JobClient jc = new JobClient(conf); + RunningJob job = jc.submitJob(conf); + ... +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Reducer leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Reducer does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Reducer the configuration given for it, + reducerConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainReducer, this is done by the setReducer or the addMapper for the last + element in the chain. + + @param job job's JobConf to add the Reducer class. + @param klass the Reducer class to add. + @param inputKeyClass reducer input key class. + @param inputValueClass reducer input value class. + @param outputKeyClass reducer output key class. + @param outputValueClass reducer output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param reducerConf a JobConf with the configuration for the Reducer + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + + + + + + + + It has to be specified how key and values are passed from one element of + the chain to the next, by value or by reference. If a Mapper leverages the + assumed semantics that the key and values are not modified by the collector + 'by value' must be used. If the Mapper does not expect this semantics, as + an optimization to avoid serialization and deserialization 'by reference' + can be used. +

+ For the added Mapper the configuration given for it, + mapperConf, has precedence over the job's JobConf. This + precedence is in effect when the task is running. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainMapper, this is done by the addMapper for the last mapper in the chain + . + + @param job chain job's JobConf to add the Mapper class. + @param klass the Mapper class to add. + @param inputKeyClass mapper input key class. + @param inputValueClass mapper input value class. + @param outputKeyClass mapper output key class. + @param outputValueClass mapper output value class. + @param byValue indicates if key/values should be passed by value + to the next Mapper in the chain, if any. + @param mapperConf a JobConf with the configuration for the Mapper + class. It is recommended to use a JobConf without default values using the + JobConf(boolean loadDefaults) constructor with FALSE.]]> + + + + + + + If this method is overriden super.configure(...) should be + invoked at the beginning of the overwriter method.]]> + + + + + + + + + + reduce(...) method of the Reducer with the + map(...) methods of the Mappers in the chain.]]> + + + + + + + If this method is overriden super.close() should be + invoked at the end of the overwriter method.]]> + + + + + For each record output by the Reducer, the Mapper classes are invoked in a + chained (or piped) fashion, the output of the first becomes the input of the + second, and so on until the last Mapper, the output of the last Mapper will + be written to the task's output. +

+ The key functionality of this feature is that the Mappers in the chain do not + need to be aware that they are executed after the Reducer or in a chain. + This enables having reusable specialized Mappers that can be combined to + perform composite operations within a single task. +

+ Special care has to be taken when creating chains so that the key/values output + by a Mapper are valid for the following Mapper in the chain. It is assumed + all Mappers and the Reducer in the chain use matching output and input key and + value classes, as no conversion is done by the chaining code. +

+ Using the ChainMapper and the ChainReducer classes it is possible to compose + Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An + immediate benefit of this pattern is a dramatic reduction in disk IO. +

+ IMPORTANT: There is no need to specify the output key/value classes for the + ChainReducer, this is done by the setReducer or the addMapper for the last + element in the chain. +

+ ChainReducer usage pattern: +

+

+ ...
+ conf.setJobName("chain");
+ conf.setInputFormat(TextInputFormat.class);
+ conf.setOutputFormat(TextOutputFormat.class);
+ 

+ JobConf mapAConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class, + Text.class, Text.class, true, mapAConf); +

+ JobConf mapBConf = new JobConf(false); + ... + ChainMapper.addMapper(conf, BMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, mapBConf); +

+ JobConf reduceConf = new JobConf(false); + ... + ChainReducer.setReducer(conf, XReduce.class, LongWritable.class, Text.class, + Text.class, Text.class, true, reduceConf); +

+ ChainReducer.addMapper(conf, CMap.class, Text.class, Text.class, + LongWritable.class, Text.class, false, null); +

+ ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class, + LongWritable.class, LongWritable.class, true, null); +

+ FileInputFormat.setInputPaths(conf, inDir); + FileOutputFormat.setOutputPath(conf, outDir); + ... +

+ JobClient jc = new JobClient(conf); + RunningJob job = jc.submitJob(conf); + ... +

]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + RecordReader's for CombineFileSplit's. + @see CombineFileSplit]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + th Path]]> + + + + + + th Path]]> + + + + + + + + + + + th Path]]> + + + + + + + + + + + + + + + + + + + + + + + + + + CombineFileSplit can be used to implement {@link org.apache.hadoop.mapred.RecordReader}'s, + with reading one record per file. + @see org.apache.hadoop.mapred.FileSplit + @see CombineFileInputFormat]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + @param freq The frequency with which records will be emitted.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + This will read every split at the client, which is very expensive. + @param freq Probability with which a key will be chosen. + @param numSamples Total number of samples to obtain from all selected + splits.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + all splits. + Takes the first numSamples / numSplits records from each split. + @param numSamples Total number of samples to obtain from all selected + splits.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the name output is multi, false + if it is single. If the name output is not defined it returns + false]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @param conf job conf to add the named output + @param namedOutput named output name, it has to be a word, letters + and numbers only, cannot be the word 'part' as + that is reserved for the + default output. + @param outputFormatClass OutputFormat class. + @param keyClass key class + @param valueClass value class]]> + + + + + + + + + + + + @param conf job conf to add the named output + @param namedOutput named output name, it has to be a word, letters + and numbers only, cannot be the word 'part' as + that is reserved for the + default output. + @param outputFormatClass OutputFormat class. + @param keyClass key class + @param valueClass value class]]> + + + + + + + + By default these counters are disabled. +

+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+ @param conf job conf in which to enable or disable the counters.
+ @param enabled indicates if the counters will be enabled or not.]]>
+
+
+ + + + + By default these counters are disabled. +

+ MultipleOutputs supports counters; by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname.
+
+
+ @param conf job conf to check for the counters being enabled.
+ @return TRUE if the counters are enabled, FALSE if they are disabled.]]>
+
+
+ + + + + + + + + + + + + @param namedOutput the named output name + @param reporter the reporter + @return the output collector for the given named output + @throws IOException thrown if output collector could not be created]]> + + + + + + + + + + + @param namedOutput the named output name + @param multiName the multi name part + @param reporter the reporter + @return the output collector for the given named output + @throws IOException thrown if output collector could not be created]]> + + + + + + + If overriden subclasses must invoke super.close() at the + end of their close() + + @throws java.io.IOException thrown if any of the MultipleOutput files + could not be closed properly.]]> + + + + OutputCollector passed to + the map() and reduce() methods of the + Mapper and Reducer implementations. +

+ Each additional output, or named output, may be configured with its own + OutputFormat, with its own key class and with its own value + class. +

+ A named output can be a single file or a multi file. The latter is referred to
+ as a multi named output. +

+ A multi named output is an unbounded set of files, all sharing the same
+ OutputFormat, key class and value class configuration. +

+ When named outputs are used within a Mapper implementation,
+ key/values written to a named output are not part of the reduce phase; only
+ key/values written to the job OutputCollector are part of the
+ reduce phase. +

+ MultipleOutputs supports counters; by default they are disabled. The counters
+ group is the {@link MultipleOutputs} class name. +

+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, an underscore '_' and the multiname. +
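+
+ A minimal sketch of how this counter switch might be used when setting up a
+ job. The setCountersEnabled/getCountersEnabled method names are assumed here
+ (they are not spelled out in this fragment), so treat it as illustrative only:
+
+ JobConf conf = new JobConf();
+ // Enable one counter per named output; the group is the MultipleOutputs class name.
+ MultipleOutputs.setCountersEnabled(conf, true);
+ boolean countersOn = MultipleOutputs.getCountersEnabled(conf); // now true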

+ Job configuration usage pattern is: +

+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+   SequenceFileOutputFormat.class,
+   LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+ 
+

+ Usage in the Reducer is: +

+
+ public class MOReduce implements
+   Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+ 
]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Map implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property; its default
+ value is 10 threads. +
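+
+ A short illustrative sketch of that configuration; MyJob and
+ MyThreadSafeMapper are placeholders, not classes from this codebase:
+
+ JobConf conf = new JobConf(MyJob.class);
+ conf.setMapperClass(MyThreadSafeMapper.class);        // must be thread-safe
+ conf.setMapRunnerClass(MultithreadedMapRunner.class);
+ // Use 20 worker threads instead of the default 10.
+ conf.setInt("mapred.map.multithreadedrunner.threads", 20);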

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pairs. Uses + {@link StringTokenizer} to break text into tokens. + @deprecated Use + {@link org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper} instead.]]> + + + + + + + + + + + + total.order.partitioner.natural.order is not false, a trie + of the first total.order.partitioner.max.trie.depth(2) + 1 bytes + will be built. Otherwise, keys will be located using a binary search of + the partition keyset using the {@link org.apache.hadoop.io.RawComparator} + defined for this job. The input file must be sorted with the same + comparator and contain {@link + org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.]]> + + + + + + + + + + + + R reduces, there are R-1 + keys in the SequenceFile.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + generateKeyValPairs(Object key, Object value); public void + configure(JobConfjob); } + + The package also provides a base class, ValueAggregatorBaseDescriptor, + implementing the above interface. The user can extend the base class and + implement generateKeyValPairs accordingly. + + The primary work of generateKeyValPairs is to emit one or more key/value + pairs based on the input key/value pair. The key in an output key/value pair + encode two pieces of information: aggregation type and aggregation id. The + value will be aggregated onto the aggregation id according the aggregation + type. + + This class offers a function to generate a map/reduce job using Aggregate + framework. The function takes the following parameters: input directory spec + input format (text or sequence file) output directory a file specifying the + user plugin class]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The job can be configured using the static methods in this class, + {@link DBInputFormat}, and {@link DBOutputFormat}. +

+ Alternatively, the properties can be set in the configuration with proper + values. + + @see DBConfiguration#configureDB(JobConf, String, String, String, String) + @see DBInputFormat#setInput(JobConf, Class, String, String) + @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...) + @see DBOutputFormat#setOutput(JobConf, String, String...)]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 20070101 AND length > 0)' + @param orderBy the fieldNames in the orderBy clause. + @param fieldNames The field names in the table + @see #setInput(JobConf, Class, String, String)]]> + + + + + + + + + + + + + + DBInputFormat emits LongWritables containing the record number as + key and DBWritables as value. + + The SQL query, and input class can be using one of the two + setInput methods.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {@link DBOutputFormat} accepts <key,value> pairs, where + key has a type extending DBWritable. Returned {@link RecordWriter} + writes only the key to the database with a batch SQL query.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + DBWritable. DBWritable, is similar to {@link Writable} + except that the {@link #write(PreparedStatement)} method takes a + {@link PreparedStatement}, and {@link #readFields(ResultSet)} + takes a {@link ResultSet}. +

+ Implementations are responsible for writing the fields of the object + to PreparedStatement, and reading the fields of the object from the + ResultSet. + +

Example:

+ If we have the following table in the database:
+ CREATE TABLE MyTable (
+   counter        INTEGER NOT NULL,
+   timestamp      BIGINT  NOT NULL
+ );
+ 
+ then we can read/write the tuples from/to the table with:

+ public class MyWritable implements Writable, DBWritable {
+   // Some data     
+   private int counter;
+   private long timestamp;
+       
+   //Writable#write() implementation
+   public void write(DataOutput out) throws IOException {
+     out.writeInt(counter);
+     out.writeLong(timestamp);
+   }
+       
+   //Writable#readFields() implementation
+   public void readFields(DataInput in) throws IOException {
+     counter = in.readInt();
+     timestamp = in.readLong();
+   }
+       
+   public void write(PreparedStatement statement) throws SQLException {
+     statement.setInt(1, counter);
+     statement.setLong(2, timestamp);
+   }
+       
+   public void readFields(ResultSet resultSet) throws SQLException {
+     counter = resultSet.getInt(1);
+     timestamp = resultSet.getLong(2);
+   } 
+ }
+ 

]]> +
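+
+ A hedged sketch of how a job could be wired to the MyWritable example above,
+ using the configureDB/setInput/setOutput signatures listed in the @see tags;
+ the driver class, JDBC URL and credentials are placeholders:
+
+ JobConf job = new JobConf(MyDBJob.class);                 // placeholder job class
+ DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
+     "jdbc:mysql://localhost/mydb", "user", "password");
+ // Read MyTable rows, ordered by timestamp, as MyWritable values.
+ DBInputFormat.setInput(job, MyWritable.class, "MyTable",
+     null /* conditions */, "timestamp", "counter", "timestamp");
+ // For the write path, declare the target table and its fields instead.
+ DBOutputFormat.setOutput(job, "MyTable", "counter", "timestamp");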
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Counters represent global counters, defined either by the + Map-Reduce framework or applications. Each Counter is named by + an {@link Enum} and has a long for the value.

+ +

Counters are bunched into Groups, each comprising of + counters from a particular Enum class.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Each {@link InputSplit} is then assigned to an individual {@link Mapper} + for processing.
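+
+ A small sketch of the pattern described above; the enum and the use of
+ getCounter on the task context are illustrative assumptions, not taken from
+ this file:
+
+ // One Group ("MyCounters") with two application-defined Counters.
+ public enum MyCounters { MALFORMED_RECORDS, SKIPPED_RECORDS }
+
+ // Inside a map() or reduce() implementation:
+ context.getCounter(MyCounters.MALFORMED_RECORDS).increment(1);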

+ +

Note: The split is a logical split of the inputs and the + input files are not physically split into chunks. For e.g. a split could + be <input-file-path, start, offset> tuple. The InputFormat + also creates the {@link RecordReader} to read the {@link InputSplit}. + + @param context job configuration. + @return an array of {@link InputSplit}s for the job.]]> + + + + + + + + + + + + + InputFormat describes the input-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the InputFormat of the + job to:

+

    +
  1. + Validate the input-specification of the job. +
  2. + Split-up the input file(s) into logical {@link InputSplit}s, each of + which is then assigned to an individual {@link Mapper}. +
  3. + Provide the {@link RecordReader} implementation to be used to glean + input records from the logical InputSplit for processing by + the {@link Mapper}. +
+ +

The default behavior of file-based {@link InputFormat}s, typically + sub-classes of {@link FileInputFormat}, is to split the + input into logical {@link InputSplit}s based on the total size, in + bytes, of the input files. However, the {@link FileSystem} blocksize of + the input files is treated as an upper bound for input splits. A lower bound + on the split size can be set via + + mapred.min.split.size.
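+
+ For example, a sketch of raising that lower bound through the job
+ configuration (setLong on the configuration object is assumed to be the way
+ this property is set):
+
+ // Ask for splits of at least 64 MB, regardless of the HDFS block size.
+ conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);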

+ +

Clearly, logical splits based on input-size is insufficient for many + applications since record boundaries are to respected. In such cases, the + application has to also implement a {@link RecordReader} on whom lies the + responsibility to respect record-boundaries and present a record-oriented + view of the logical InputSplit to the individual task. + + @see InputSplit + @see RecordReader + @see FileInputFormat]]> + + + + + + + + + + + + + + + + + + + + + + + InputSplit represents the data to be processed by an + individual {@link Mapper}. + +

Typically, it presents a byte-oriented view on the input and is the + responsibility of {@link RecordReader} of the job to process this and present + a record-oriented view. + + @see InputFormat + @see RecordReader]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + InputFormat to use + @throws IllegalStateException if the job is submitted]]> + + + + + + + OutputFormat to use + @throws IllegalStateException if the job is submitted]]> + + + + + + + Mapper to use + @throws IllegalStateException if the job is submitted]]> + + + + + + + + + + + + + + + + + + + + + + + + + Reducer to use + @throws IllegalStateException if the job is submitted]]> + + + + + + + Partitioner to use + @throws IllegalStateException if the job is submitted]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + progress of the job's map-tasks, as a float between 0.0 + and 1.0. When all map tasks have completed, the function returns 1.0. + + @return the progress of the job's map-tasks. + @throws IOException]]> + + + + + + progress of the job's reduce-tasks, as a float between 0.0 + and 1.0. When all reduce tasks have completed, the function returns 1.0. + + @return the progress of the job's reduce-tasks. + @throws IOException]]> + + + + + + true if the job is complete, else false. + @throws IOException]]> + + + + + + true if the job succeeded, else false. + @throws IOException]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JobTracker is lost]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1. + @return the number of reduce tasks for this job.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + An example JobID is : + job_200707121733_0003 , which represents the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse JobID strings, but rather + use appropriate constructors or {@link #forName(String)} method. + + @see TaskID + @see TaskAttemptID + @see org.apache.hadoop.mapred.JobTracker#getNewJobId() + @see org.apache.hadoop.mapred.JobTracker#getStartTime()]]> + + + + + + + + + + + + + + + + + + + + + + + + + + the key input type to the Mapper + @param the value input type to the Mapper + @param the key output type from the Mapper + @param the value output type from the Mapper]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Maps are the individual tasks which transform input records into a + intermediate records. The transformed intermediate records need not be of + the same type as the input records. A given input pair may map to zero or + many output pairs.

+ +

The Hadoop Map-Reduce framework spawns one map task for each + {@link InputSplit} generated by the {@link InputFormat} for the job. + Mapper implementations can access the {@link Configuration} for + the job via the {@link JobContext#getConfiguration()}. + +

The framework first calls + {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}, followed by + {@link #map(Object, Object, Context)} + for each key/value pair in the InputSplit. Finally + {@link #cleanup(Context)} is called.

+ +

All intermediate values associated with a given output key are + subsequently grouped by the framework, and passed to a {@link Reducer} to + determine the final output. Users can control the sorting and grouping by + specifying two key {@link RawComparator} classes.

+ +

The Mapper outputs are partitioned per + Reducer. Users can control which keys (and hence records) go to + which Reducer by implementing a custom {@link Partitioner}. + +

Users can optionally specify a combiner, via + {@link Job#setCombinerClass(Class)}, to perform local aggregation of the + intermediate outputs, which helps to cut down the amount of data transferred + from the Mapper to the Reducer. + +

Applications can specify if and how the intermediate + outputs are to be compressed and which {@link CompressionCodec}s are to be + used via the Configuration.

+ +

If the job has zero + reduces then the output of the Mapper is directly written + to the {@link OutputFormat} without sorting by keys.

+ +

Example:

+

+ public class TokenCounterMapper 
+     extends Mapper<Object, Text, Text, IntWritable>{
+    
+   private final static IntWritable one = new IntWritable(1);
+   private Text word = new Text();
+   
+   public void map(Object key, Text value, Context context) throws IOException {
+     StringTokenizer itr = new StringTokenizer(value.toString());
+     while (itr.hasMoreTokens()) {
+       word.set(itr.nextToken());
+       context.write(word, one);
+     }
+   }
+ }
+ 

+ +

Applications may override the {@link #run(Context)} method to exert + greater control on map processing e.g. multi-threaded Mappers + etc.
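+
+ As a sketch, an overriding run() that keeps the default single-threaded shape
+ (the context iteration methods shown are assumed from the Mapper.Context API):
+
+ public void run(Context context) throws IOException, InterruptedException {
+   setup(context);
+   while (context.nextKeyValue()) {
+     map(context.getCurrentKey(), context.getCurrentValue(), context);
+   }
+   cleanup(context);
+ }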

+ + @see InputFormat + @see JobContext + @see Partitioner + @see Reducer]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OutputCommitter describes the commit of task output for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputCommitter of + the job to:

+

    +
  1. + Setup the job during initialization. For example, create the temporary + output directory for the job during the initialization of the job. +
  2. + Cleanup the job after the job completion. For example, remove the + temporary output directory after the job completion. +
  3. + Setup the task temporary output. +
  4. + Check whether a task needs a commit. This is to avoid the commit + procedure if a task does not need commit. +
  5. + Commit of the task output. +
  6. + Discard the task commit. +
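+
+ A rough sketch of a minimal committer covering the responsibilities listed
+ above; the method names are assumed from the standard OutputCommitter API,
+ and a real implementation would track actual output paths:
+
+ public class NoOpOutputCommitter extends OutputCommitter {
+   public void setupJob(JobContext context) throws IOException { }
+   public void cleanupJob(JobContext context) throws IOException { }
+   public void setupTask(TaskAttemptContext context) throws IOException { }
+   public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
+     return false;   // nothing to promote, so the commit step is skipped
+   }
+   public void commitTask(TaskAttemptContext context) throws IOException { }
+   public void abortTask(TaskAttemptContext context) throws IOException { }
+ }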
+ + @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter + @see JobContext + @see TaskAttemptContext]]> +
+
+ + + + + + + + + + + + + + + + + + + This is to validate the output specification for the job when it is + a job is submitted. Typically checks that it does not already exist, + throwing an exception when it already exists, so that output is not + overwritten.

+ + @param context information about the job + @throws IOException when output should not be attempted]]> +
+
+ + + + + + + + + + OutputFormat describes the output-specification for a + Map-Reduce job. + +

The Map-Reduce framework relies on the OutputFormat of the + job to:

+

    +
  1. + Validate the output-specification of the job. For example, check that the + output directory doesn't already exist. +
  2. + Provide the {@link RecordWriter} implementation to be used to write out + the output files of the job. Output files are stored in a + {@link FileSystem}. +
+ + @see RecordWriter]]> +
+
+ + + + + + + + + + + Typically a hash function on a all or a subset of the key.

+ + @param key the key to be partioned. + @param value the entry value. + @param numPartitions the total number of partitions. + @return the partition number for the key.]]> +
+
+ + Partitioner controls the partitioning of the keys of the + intermediate map-outputs. The key (or a subset of the key) is used to derive + the partition, typically by a hash function. The total number of partitions + is the same as the number of reduce tasks for the job. Hence this controls + which of the m reduce tasks the intermediate key (and hence the + record) is sent for reduction.

+ + @see Reducer]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @param ]]> + + + + + + + + + + + + + + + + + + + + + + RecordWriter to future operations. + + @param context the context of the task + @throws IOException]]> + + + + RecordWriter writes the output <key, value> pairs + to an output file. + +

RecordWriter implementations write the job outputs to the + {@link FileSystem}. + + @see OutputFormat]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the class of the input keys + @param the class of the input values + @param the class of the output keys + @param the class of the output values]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Reducer implementations + can access the {@link Configuration} for the job via the + {@link JobContext#getConfiguration()} method.

+ +

Reducer has 3 primary phases:

+
    +
  1. + +

    Shuffle

    + +

    The Reducer copies the sorted output from each + {@link Mapper} using HTTP across the network.

    +
  2. + +
  3. +

    Sort

    + +

    The framework merge sorts Reducer inputs by + keys + (since different Mappers may have output the same key).

    + +

    The shuffle and sort phases occur simultaneously i.e. while outputs are + being fetched they are merged.

    + +
    SecondarySort
    + +

    To achieve a secondary sort on the values returned by the value
+ iterator, the application should extend the key with the secondary
+ key and define a grouping comparator. The keys will be sorted using the
+ entire key, but will be grouped using the grouping comparator to decide
+ which keys and values are sent in the same call to reduce. The grouping
+ comparator is specified via
+ {@link Job#setGroupingComparatorClass(Class)}. The sort order is
+ controlled by
+ {@link Job#setSortComparatorClass(Class)}.

    + + + For example, say that you want to find duplicate web pages and tag them + all with the url of the "best" known example. You would set up the job + like: +
      +
    • Map Input Key: url
    • Map Input Value: document
    • Map Output Key: document checksum, url pagerank
    • Map Output Value: url
    • Partitioner: by checksum
    • OutputKeyComparator: by checksum and then decreasing pagerank
    • OutputValueGroupingComparator: by checksum
    +
  4. + +
  5. +

    Reduce

    + +

    In this phase the + {@link #reduce(Object, Iterable, Context)} + method is called for each <key, (collection of values)> in + the sorted inputs.

    +

    The output of the reduce task is typically written to a + {@link RecordWriter} via + {@link Context#write(Object, Object)}.

    +
  6. +
+ +

The output of the Reducer is not re-sorted.

+ +

Example:

+

+ public class IntSumReducer<Key> extends Reducer<Key,IntWritable,Key,IntWritable> {
+   private IntWritable result = new IntWritable();
+ 
+   public void reduce(Key key, Iterable<IntWritable> values, 
+                      Context context) throws IOException {
+     int sum = 0;
+     for (IntWritable val : values) {
+       sum += val.get();
+     }
+     result.set(sum);
+     context.write(key, result);
+   }
+ }
+ 

+ + @see Mapper + @see Partitioner]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + An example TaskAttemptID is : + attempt_200707121733_0003_m_000005_0 , which represents the + zeroth task attempt for the fifth map task in the third job + running at the jobtracker started at 200707121733. +

+ Applications should never construct or parse TaskAttemptID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + An example TaskID is : + task_200707121733_0003_m_000005 , which represents the + fifth map task in the third job running at the jobtracker + started at 200707121733. +

+ Applications should never construct or parse TaskID strings + , but rather use appropriate constructors or {@link #forName(String)} + method. + + @see JobID + @see TaskAttemptID]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the input key type for the task + @param the input value type for the task + @param the output key type for the task + @param the output value type for the task]]> + + + + + + + + + + + + + + + + + + + FileInputFormat implementations can override this and return + false to ensure that individual input files are never split-up + so that {@link Mapper}s process entire files. + + @param context the job context + @param filename the file name to check + @return is this file splitable?]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FileInputFormat is the base class for all file-based + InputFormats. This provides a generic implementation of + {@link #getSplits(JobContext)}. + Subclasses of FileInputFormat can also override the + {@link #isSplitable(JobContext, Path)} method to ensure input-files are + not split-up and are processed as a whole by {@link Mapper}s.]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the map's input key type + @param the map's input value type + @param the map's output key type + @param the map's output value type + @param job the job + @return the mapper class to run]]> + + + + + + + the map input key type + @param the map input value type + @param the map output key type + @param the map output value type + @param job the job to modify + @param cls the class to use as the mapper]]> + + + + + + + + + + + + + It can be used instead of the default implementation, + @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU + bound in order to improve throughput. +

+ Mapper implementations using this MapRunnable must be thread-safe. +

+ The Map-Reduce job has to be configured with the mapper to use via
+ {@link #setMapperClass(Configuration, Class)} and
+ the number of threads the thread-pool can use with the
+ {@link #getNumberOfThreads(Configuration)} method. The default
+ value is 10 threads. +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true if the job output should be compressed, + false otherwise]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tasks' Side-Effect Files + +

Some applications need to create/write-to side-files, which differ from + the actual job-outputs. + +

In such cases there could be issues with 2 instances of the same TIP + (running simultaneously e.g. speculative tasks) trying to open/write-to the + same file (path) on HDFS. Hence the application-writer will have to pick + unique names per task-attempt (e.g. using the attemptid, say + attempt_200709221812_0001_m_000000_0), not just per TIP.

+ +

To get around this the Map-Reduce framework helps the application-writer + out by maintaining a special + ${mapred.output.dir}/_temporary/_${taskid} + sub-directory for each task-attempt on HDFS where the output of the + task-attempt goes. On successful completion of the task-attempt the files + in the ${mapred.output.dir}/_temporary/_${taskid} (only) + are promoted to ${mapred.output.dir}. Of course, the + framework discards the sub-directory of unsuccessful task-attempts. This + is completely transparent to the application.

+ +

The application-writer can take advantage of this by creating any
+ side-files required in a work directory during execution
+ of their task i.e. via
+ {@link #getWorkOutputPath(TaskInputOutputContext)}, and
+ the framework will move them out similarly; thus they don't have to pick
+ unique paths per task-attempt.
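+
+ A hedged sketch of that pattern; the file name and stream handling are
+ illustrative only:
+
+ // Inside a map or reduce task, with access to the task context:
+ Path workDir = FileOutputFormat.getWorkOutputPath(context);
+ FileSystem fs = workDir.getFileSystem(context.getConfiguration());
+ // Anything created under workDir is promoted to ${mapred.output.dir} on success.
+ FSDataOutputStream side = fs.create(new Path(workDir, "side-data.txt"));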

+ +

The entire discussion holds true for maps of jobs with + reducer=NONE (i.e. 0 reduces) since output of the map, in that case, + goes directly to HDFS.

+ + @return the {@link Path} to the task's temporary output directory + for the map-reduce job.]]> +
+ + + + + + + + + The path can be used to create custom files from within the map and + reduce tasks. The path name will be unique for each task. The path parent + will be the job output directory.

+

This method uses the {@link #getUniqueFile} method to make the file name + unique for the task.

+ + @param context the context for the task. + @param name the name for the file. + @param extension the extension for the file + @return a unique path accross all tasks of the job.]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool supports archiving and anaylzing (sort/grep) of log-files. + It takes as input + a) Input uri which will serve uris of the logs to be archived. + b) Output directory (not mandatory). + b) Directory on dfs to archive the logs. + c) The sort/grep patterns for analyzing the files and separator for boundaries. + Usage: + Logalyzer -archive -archiveDir -analysis -logs -grep -sort -separator +

]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/kfs-0.2.2.jar b/lib/kfs-0.2.2.jar new file mode 100644 index 00000000000..aa32e74baf2 Binary files /dev/null and b/lib/kfs-0.2.2.jar differ diff --git a/lib/kfs-0.2.LICENSE.txt b/lib/kfs-0.2.LICENSE.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/lib/kfs-0.2.LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/src/test/org/apache/hadoop/cli/TestCLI.java b/src/test/org/apache/hadoop/cli/TestCLI.java new file mode 100644 index 00000000000..306733cefd2 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/TestCLI.java @@ -0,0 +1,450 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli; + +import java.io.File; +import java.util.ArrayList; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import junit.framework.TestCase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.cli.util.CLITestData; +import org.apache.hadoop.cli.util.CommandExecutor; +import org.apache.hadoop.cli.util.ComparatorBase; +import org.apache.hadoop.cli.util.ComparatorData; +import org.apache.hadoop.cli.util.CLITestData.TestCmd; +import org.apache.hadoop.cli.util.CLITestData.TestCmd.CommandType; +import org.apache.hadoop.cli.util.CommandExecutor.Result; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.security.authorize.ServiceAuthorizationManager; +import org.apache.hadoop.util.StringUtils; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Tests for the Command Line Interface (CLI) + */ +public class TestCLI extends TestCase { + private static final Log LOG = + LogFactory.getLog(TestCLI.class.getName()); + + // In this mode, it runs the command and compares the actual output + // with the expected output + public static final String TESTMODE_TEST = "test"; // Run the tests + + // If it is set to nocompare, run the command and do not compare. + // This can be useful populate the testConfig.xml file the first time + // a new command is added + public static final String TESTMODE_NOCOMPARE = "nocompare"; + public static final String TEST_CACHE_DATA_DIR = + System.getProperty("test.cache.data", "build/test/cache"); + + //By default, run the tests. 
The other mode is to run the commands and not + // compare the output + protected String testMode = TESTMODE_TEST; + + // Storage for tests read in from the config file + protected ArrayList testsFromConfigFile = null; + protected ArrayList testComparators = null; + protected String thisTestCaseName = null; + protected ComparatorData comparatorData = null; + protected Configuration conf = null; + protected String clitestDataDir = null; + protected String username = null; + + /** + * Read the test config file - testConfig.xml + */ + protected void readTestConfigFile() { + String testConfigFile = getTestFile(); + if (testsFromConfigFile == null) { + boolean success = false; + testConfigFile = TEST_CACHE_DATA_DIR + File.separator + testConfigFile; + try { + SAXParser p = (SAXParserFactory.newInstance()).newSAXParser(); + p.parse(testConfigFile, new TestConfigFileParser()); + success = true; + } catch (Exception e) { + LOG.info("File: " + testConfigFile + " not found"); + success = false; + } + assertTrue("Error reading test config file", success); + } + } + + protected String getTestFile() { + return "testConf.xml"; + } + + /* + * Setup + */ + public void setUp() throws Exception { + // Read the testConfig.xml file + readTestConfigFile(); + + conf = new Configuration(); + conf.setBoolean(ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, + true); + + clitestDataDir = new File(TEST_CACHE_DATA_DIR). + toURI().toString().replace(' ', '+'); + } + + /** + * Tear down + */ + public void tearDown() throws Exception { + displayResults(); + } + + /** + * Expand the commands from the test config xml file + * @param cmd + * @return String expanded command + */ + protected String expandCommand(final String cmd) { + String expCmd = cmd; + expCmd = expCmd.replaceAll("CLITEST_DATA", clitestDataDir); + expCmd = expCmd.replaceAll("USERNAME", username); + + return expCmd; + } + + /** + * Display the summarized results + */ + private void displayResults() { + LOG.info("Detailed results:"); + LOG.info("----------------------------------\n"); + + for (int i = 0; i < testsFromConfigFile.size(); i++) { + CLITestData td = testsFromConfigFile.get(i); + + boolean testResult = td.getTestResult(); + + // Display the details only if there is a failure + if (!testResult) { + LOG.info("-------------------------------------------"); + LOG.info(" Test ID: [" + (i + 1) + "]"); + LOG.info(" Test Description: [" + td.getTestDesc() + "]"); + LOG.info(""); + + ArrayList testCommands = td.getTestCommands(); + for (TestCmd cmd : testCommands) { + LOG.info(" Test Commands: [" + + expandCommand(cmd.getCmd()) + "]"); + } + + LOG.info(""); + ArrayList cleanupCommands = td.getCleanupCommands(); + for (TestCmd cmd : cleanupCommands) { + LOG.info(" Cleanup Commands: [" + + expandCommand(cmd.getCmd()) + "]"); + } + + LOG.info(""); + ArrayList compdata = td.getComparatorData(); + for (ComparatorData cd : compdata) { + boolean resultBoolean = cd.getTestResult(); + LOG.info(" Comparator: [" + + cd.getComparatorType() + "]"); + LOG.info(" Comparision result: [" + + (resultBoolean ? 
"pass" : "fail") + "]"); + LOG.info(" Expected output: [" + + cd.getExpectedOutput() + "]"); + LOG.info(" Actual output: [" + + cd.getActualOutput() + "]"); + } + LOG.info(""); + } + } + + LOG.info("Summary results:"); + LOG.info("----------------------------------\n"); + + boolean overallResults = true; + int totalPass = 0; + int totalFail = 0; + int totalComparators = 0; + for (int i = 0; i < testsFromConfigFile.size(); i++) { + CLITestData td = testsFromConfigFile.get(i); + totalComparators += + testsFromConfigFile.get(i).getComparatorData().size(); + boolean resultBoolean = td.getTestResult(); + if (resultBoolean) { + totalPass ++; + } else { + totalFail ++; + } + overallResults &= resultBoolean; + } + + + LOG.info(" Testing mode: " + testMode); + LOG.info(""); + LOG.info(" Overall result: " + + (overallResults ? "+++ PASS +++" : "--- FAIL ---")); + if ((totalPass + totalFail) == 0) { + LOG.info(" # Tests pass: " + 0); + LOG.info(" # Tests fail: " + 0); + } + else + { + LOG.info(" # Tests pass: " + totalPass + + " (" + (100 * totalPass / (totalPass + totalFail)) + "%)"); + LOG.info(" # Tests fail: " + totalFail + + " (" + (100 * totalFail / (totalPass + totalFail)) + "%)"); + } + + LOG.info(" # Validations done: " + totalComparators + + " (each test may do multiple validations)"); + + LOG.info(""); + LOG.info("Failing tests:"); + LOG.info("--------------"); + int i = 0; + boolean foundTests = false; + for (i = 0; i < testsFromConfigFile.size(); i++) { + boolean resultBoolean = testsFromConfigFile.get(i).getTestResult(); + if (!resultBoolean) { + LOG.info((i + 1) + ": " + + testsFromConfigFile.get(i).getTestDesc()); + foundTests = true; + } + } + if (!foundTests) { + LOG.info("NONE"); + } + + foundTests = false; + LOG.info(""); + LOG.info("Passing tests:"); + LOG.info("--------------"); + for (i = 0; i < testsFromConfigFile.size(); i++) { + boolean resultBoolean = testsFromConfigFile.get(i).getTestResult(); + if (resultBoolean) { + LOG.info((i + 1) + ": " + + testsFromConfigFile.get(i).getTestDesc()); + foundTests = true; + } + } + if (!foundTests) { + LOG.info("NONE"); + } + + assertTrue("One of the tests failed. " + + "See the Detailed results to identify " + + "the command that failed", overallResults); + + } + + /** + * Compare the actual output with the expected output + * @param compdata + * @return + */ + private boolean compareTestOutput(ComparatorData compdata, Result cmdResult) { + // Compare the output based on the comparator + String comparatorType = compdata.getComparatorType(); + Class comparatorClass = null; + + // If testMode is "test", then run the command and compare the output + // If testMode is "nocompare", then run the command and dump the output. + // Do not compare + + boolean compareOutput = false; + + if (testMode.equals(TESTMODE_TEST)) { + try { + // Initialize the comparator class and run its compare method + comparatorClass = Class.forName("org.apache.hadoop.cli.util." + + comparatorType); + ComparatorBase comp = (ComparatorBase) comparatorClass.newInstance(); + compareOutput = comp.compare(cmdResult.getCommandOutput(), + compdata.getExpectedOutput()); + } catch (Exception e) { + LOG.info("Error in instantiating the comparator" + e); + } + } + + return compareOutput; + } + + /*********************************** + ************* TESTS + *********************************/ + + public void testAll() { + LOG.info("TestAll"); + + // Run the tests defined in the testConf.xml config file. 
+ for (int index = 0; index < testsFromConfigFile.size(); index++) { + + CLITestData testdata = (CLITestData) testsFromConfigFile.get(index); + + // Execute the test commands + ArrayList testCommands = testdata.getTestCommands(); + Result cmdResult = null; + for (TestCmd cmd : testCommands) { + try { + cmdResult = execute(cmd); + } catch (Exception e) { + fail(StringUtils.stringifyException(e)); + } + } + + boolean overallTCResult = true; + // Run comparators + ArrayList compdata = testdata.getComparatorData(); + for (ComparatorData cd : compdata) { + final String comptype = cd.getComparatorType(); + + boolean compareOutput = false; + + if (! comptype.equalsIgnoreCase("none")) { + compareOutput = compareTestOutput(cd, cmdResult); + overallTCResult &= compareOutput; + } + + cd.setExitCode(cmdResult.getExitCode()); + cd.setActualOutput(cmdResult.getCommandOutput()); + cd.setTestResult(compareOutput); + } + testdata.setTestResult(overallTCResult); + + // Execute the cleanup commands + ArrayList cleanupCommands = testdata.getCleanupCommands(); + for (TestCmd cmd : cleanupCommands) { + try { + execute(cmd); + } catch (Exception e) { + fail(StringUtils.stringifyException(e)); + } + } + } + } + + protected CommandExecutor.Result execute(TestCmd cmd) throws Exception { + throw new Exception("Unknow type of Test command:"+ cmd.getType()); + } + + /* + * Parser class for the test config xml file + */ + class TestConfigFileParser extends DefaultHandler { + String charString = null; + CLITestData td = null; + ArrayList testCommands = null; + ArrayList cleanupCommands = null; + + @Override + public void startDocument() throws SAXException { + testsFromConfigFile = new ArrayList(); + } + + @Override + public void startElement(String uri, + String localName, + String qName, + Attributes attributes) throws SAXException { + if (qName.equals("test")) { + td = new CLITestData(); + } else if (qName.equals("test-commands")) { + testCommands = new ArrayList(); + } else if (qName.equals("cleanup-commands")) { + cleanupCommands = new ArrayList(); + } else if (qName.equals("comparators")) { + testComparators = new ArrayList(); + } else if (qName.equals("comparator")) { + comparatorData = new ComparatorData(); + } + charString = ""; + } + + @Override + public void endElement(String uri, + String localName, + String qName) throws SAXException { + if (qName.equals("description")) { + td.setTestDesc(charString); + } else if (qName.equals("test-commands")) { + td.setTestCommands(testCommands); + testCommands = null; + } else if (qName.equals("cleanup-commands")) { + td.setCleanupCommands(cleanupCommands); + cleanupCommands = null; + } else if (qName.equals("command")) { + if (testCommands != null) { + testCommands.add(new TestCmd(charString, CommandType.FS)); + } else if (cleanupCommands != null) { + cleanupCommands.add(new TestCmd(charString, CommandType.FS)); + } + } else if (qName.equals("dfs-admin-command")) { + if (testCommands != null) { + testCommands.add(new TestCmd(charString, CommandType.DFSADMIN)); + } else if (cleanupCommands != null) { + cleanupCommands.add(new TestCmd(charString, CommandType.DFSADMIN)); + } + } else if (qName.equals("mr-admin-command")) { + if (testCommands != null) { + testCommands.add(new TestCmd(charString, CommandType.MRADMIN)); + } else if (cleanupCommands != null) { + cleanupCommands.add(new TestCmd(charString, CommandType.MRADMIN)); + } + } else if (qName.equals("archive-command")) { + if (testCommands != null) { + testCommands.add(new TestCmd(charString, CommandType.ARCHIVE)); + } 
else if (cleanupCommands != null) { + cleanupCommands.add(new TestCmd(charString, CommandType.ARCHIVE)); + } + } else if (qName.equals("comparators")) { + td.setComparatorData(testComparators); + } else if (qName.equals("comparator")) { + testComparators.add(comparatorData); + } else if (qName.equals("type")) { + comparatorData.setComparatorType(charString); + } else if (qName.equals("expected-output")) { + comparatorData.setExpectedOutput(charString); + } else if (qName.equals("test")) { + testsFromConfigFile.add(td); + td = null; + } else if (qName.equals("mode")) { + testMode = charString; + if (!testMode.equals(TESTMODE_NOCOMPARE) && + !testMode.equals(TESTMODE_TEST)) { + testMode = TESTMODE_TEST; + } + } + } + + @Override + public void characters(char[] ch, + int start, + int length) throws SAXException { + String s = new String(ch, start, length); + charString += s; + } + } +} diff --git a/src/test/org/apache/hadoop/cli/testConf.xml b/src/test/org/apache/hadoop/cli/testConf.xml new file mode 100644 index 00000000000..3250aa67a18 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/testConf.xml @@ -0,0 +1,18 @@ + + + + + + test + + + + + + diff --git a/src/test/org/apache/hadoop/cli/testConf.xsl b/src/test/org/apache/hadoop/cli/testConf.xsl new file mode 100644 index 00000000000..09fb0b7a500 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/testConf.xsl @@ -0,0 +1,28 @@ + + + + + + + +

+ [testConf.xsl body: an XSLT stylesheet that renders the test configuration as an
+  HTML report titled "Hadoop DFS command-line tests", with a table of ID, Command
+  and Description columns; the stylesheet markup is not reproduced here.]
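
Before the supporting classes, a brief illustration: the parser above turns each <test> element into a CLITestData object holding TestCmd and ComparatorData entries, and TestCLI's compareTestOutput (defined earlier in that test class) checks the captured command output against the comparator named in <type>. The sketch below is illustrative only and not part of the patch; it builds one such test case by hand and runs a RegexpComparator over a sample output string. The command text, expected pattern and sample output are made-up values.

import java.util.ArrayList;

import org.apache.hadoop.cli.util.CLITestData;
import org.apache.hadoop.cli.util.CLITestData.TestCmd;
import org.apache.hadoop.cli.util.CLITestData.TestCmd.CommandType;
import org.apache.hadoop.cli.util.ComparatorData;
import org.apache.hadoop.cli.util.RegexpComparator;

public class CliTestDataSketch {
  public static void main(String[] args) {
    // One <test> entry, assembled the way TestConfigFileParser would build it.
    CLITestData td = new CLITestData();
    td.setTestDesc("ls: listing a single file (sample description)");

    ArrayList testCommands = new ArrayList();   // raw types, matching the patch
    testCommands.add(new TestCmd("-fs NAMENODE -ls /file1", CommandType.FS));
    td.setTestCommands(testCommands);

    ComparatorData cd = new ComparatorData();
    cd.setComparatorType("RegexpComparator");
    cd.setExpectedOutput("^Found 1 items");     // hypothetical expected pattern

    ArrayList comparators = new ArrayList();
    comparators.add(cd);
    td.setComparatorData(comparators);

    // A comparator reduces to a boolean: does the captured output satisfy the
    // expectation? The output string here is a made-up stand-in for what
    // CommandExecutor would capture from System.out/System.err.
    String sampleOutput = "Found 1 items\n-rw-r--r--   1 user supergroup 0 /file1";
    boolean ok = new RegexpComparator().compare(sampleOutput, cd.getExpectedOutput());

    cd.setTestResult(ok);
    td.setTestResult(ok);
    System.out.println("test passed: " + td.getTestResult());
  }
}

In TestCLI itself the comparator is selected from the <type> element by compareTestOutput; the direct instantiation above is only for illustration.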
+ + + + + diff --git a/src/test/org/apache/hadoop/cli/util/CLITestData.java b/src/test/org/apache/hadoop/cli/util/CLITestData.java new file mode 100644 index 00000000000..18a7133218f --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/CLITestData.java @@ -0,0 +1,136 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +import java.util.ArrayList; + +/** + * + * Class to store CLI Test Data + */ +public class CLITestData { + private String testDesc = null; + private ArrayList testCommands = null; + private ArrayList cleanupCommands = null; + private ArrayList comparatorData = null; + private boolean testResult = false; + + public CLITestData() { + + } + + /** + * Class to define Test Command. includes type of the command and command itself + * Valid types FS, DFSADMIN, MRADMIN and ARCHIVE. + */ + static public class TestCmd { + public enum CommandType { + FS, + DFSADMIN, + MRADMIN, + ARCHIVE + } + private final CommandType type; + private final String cmd; + + public TestCmd(String str, CommandType type) { + cmd = str; + this.type = type; + } + public CommandType getType() { + return type; + } + public String getCmd() { + return cmd; + } + public String toString() { + return cmd; + } + } + + /** + * @return the testDesc + */ + public String getTestDesc() { + return testDesc; + } + + /** + * @param testDesc the testDesc to set + */ + public void setTestDesc(String testDesc) { + this.testDesc = testDesc; + } + + /** + * @return the testCommands + */ + public ArrayList getTestCommands() { + return testCommands; + } + + /** + * @param testCommands the testCommands to set + */ + public void setTestCommands(ArrayList testCommands) { + this.testCommands = testCommands; + } + + /** + * @return the comparatorData + */ + public ArrayList getComparatorData() { + return comparatorData; + } + + /** + * @param comparatorData the comparatorData to set + */ + public void setComparatorData(ArrayList comparatorData) { + this.comparatorData = comparatorData; + } + + /** + * @return the testResult + */ + public boolean getTestResult() { + return testResult; + } + + /** + * @param testResult the testResult to set + */ + public void setTestResult(boolean testResult) { + this.testResult = testResult; + } + + /** + * @return the cleanupCommands + */ + public ArrayList getCleanupCommands() { + return cleanupCommands; + } + + /** + * @param cleanupCommands the cleanupCommands to set + */ + public void setCleanupCommands(ArrayList cleanupCommands) { + this.cleanupCommands = cleanupCommands; + } +} diff --git a/src/test/org/apache/hadoop/cli/util/CommandExecutor.java b/src/test/org/apache/hadoop/cli/util/CommandExecutor.java new file mode 100644 index 00000000000..7a0dc462a06 --- /dev/null +++ 
b/src/test/org/apache/hadoop/cli/util/CommandExecutor.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; +import java.util.StringTokenizer; + +import org.apache.hadoop.cli.TestCLI; + +/** + * + * This class execute commands and captures the output + */ +public abstract class CommandExecutor { + protected String[] getCommandAsArgs(final String cmd, final String masterKey, + final String master) { + StringTokenizer tokenizer = new StringTokenizer(cmd, " "); + String[] args = new String[tokenizer.countTokens()]; + + int i = 0; + while (tokenizer.hasMoreTokens()) { + args[i] = tokenizer.nextToken(); + + args[i] = args[i].replaceAll(masterKey, master); + args[i] = args[i].replaceAll("CLITEST_DATA", + new File(TestCLI.TEST_CACHE_DATA_DIR). + toURI().toString().replace(' ', '+')); + args[i] = args[i].replaceAll("USERNAME", System.getProperty("user.name")); + + i++; + } + + return args; + } + + public Result executeCommand(final String cmd) throws Exception { + int exitCode = 0; + Exception lastException = null; + + + ByteArrayOutputStream bao = new ByteArrayOutputStream(); + PrintStream origOut = System.out; + PrintStream origErr = System.err; + + System.setOut(new PrintStream(bao)); + System.setErr(new PrintStream(bao)); + + try { + execute(cmd); + } catch (Exception e) { + e.printStackTrace(); + lastException = e; + exitCode = -1; + } finally { + System.setOut(origOut); + System.setErr(origErr); + } + return new Result(bao.toString(), exitCode, lastException, cmd); + } + + protected abstract void execute(final String cmd) throws Exception; + + public static class Result { + final String commandOutput; + final int exitCode; + final Exception exception; + final String cmdExecuted; + public Result(String commandOutput, int exitCode, Exception exception, + String cmdExecuted) { + this.commandOutput = commandOutput; + this.exitCode = exitCode; + this.exception = exception; + this.cmdExecuted = cmdExecuted; + } + + public String getCommandOutput() { + return commandOutput; + } + + public int getExitCode() { + return exitCode; + } + + public Exception getException() { + return exception; + } + + public String getCommand() { + return cmdExecuted; + } + } + +} diff --git a/src/test/org/apache/hadoop/cli/util/ComparatorBase.java b/src/test/org/apache/hadoop/cli/util/ComparatorBase.java new file mode 100644 index 00000000000..fae99377a42 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/ComparatorBase.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +/** + * + * Comparator interface. To define a new comparator, implement the compare + * method + */ +public abstract class ComparatorBase { + public ComparatorBase() { + + } + + /** + * Compare method for the comparator class. + * @param actual output. can be null + * @param expected output. can be null + * @return true if expected output compares with the actual output, else + * return false. If actual or expected is null, return false + */ + public abstract boolean compare(String actual, String expected); +} diff --git a/src/test/org/apache/hadoop/cli/util/ComparatorData.java b/src/test/org/apache/hadoop/cli/util/ComparatorData.java new file mode 100644 index 00000000000..1b24777e4c5 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/ComparatorData.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.cli.util; + +/** + * + * Class to store CLI Test Comparators Data + */ +public class ComparatorData { + private String expectedOutput = null; + private String actualOutput = null; + private boolean testResult = false; + private int exitCode = 0; + private String comparatorType = null; + + public ComparatorData() { + + } + + /** + * @return the expectedOutput + */ + public String getExpectedOutput() { + return expectedOutput; + } + + /** + * @param expectedOutput the expectedOutput to set + */ + public void setExpectedOutput(String expectedOutput) { + this.expectedOutput = expectedOutput; + } + + /** + * @return the actualOutput + */ + public String getActualOutput() { + return actualOutput; + } + + /** + * @param actualOutput the actualOutput to set + */ + public void setActualOutput(String actualOutput) { + this.actualOutput = actualOutput; + } + + /** + * @return the testResult + */ + public boolean getTestResult() { + return testResult; + } + + /** + * @param testResult the testResult to set + */ + public void setTestResult(boolean testResult) { + this.testResult = testResult; + } + + /** + * @return the exitCode + */ + public int getExitCode() { + return exitCode; + } + + /** + * @param exitCode the exitCode to set + */ + public void setExitCode(int exitCode) { + this.exitCode = exitCode; + } + + /** + * @return the comparatorType + */ + public String getComparatorType() { + return comparatorType; + } + + /** + * @param comparatorType the comparatorType to set + */ + public void setComparatorType(String comparatorType) { + this.comparatorType = comparatorType; + } + +} diff --git a/src/test/org/apache/hadoop/cli/util/ExactComparator.java b/src/test/org/apache/hadoop/cli/util/ExactComparator.java new file mode 100644 index 00000000000..9a49a960ce0 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/ExactComparator.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +/** + * Comparator for the Command line tests. + * + * This comparator compares the actual to the expected and + * returns true only if they are the same + * + */ +public class ExactComparator extends ComparatorBase { + + @Override + public boolean compare(String actual, String expected) { + return actual.equals(expected); + } +} diff --git a/src/test/org/apache/hadoop/cli/util/RegexpAcrossOutputComparator.java b/src/test/org/apache/hadoop/cli/util/RegexpAcrossOutputComparator.java new file mode 100644 index 00000000000..9285bde9454 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/RegexpAcrossOutputComparator.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +import java.util.regex.Pattern; + +/** + * Comparator for command line tests that attempts to find a regexp + * within the entire text returned by a command. + * + * This comparator differs from RegexpComparator in that it attempts + * to match the pattern within all of the text returned by the command, + * rather than matching against each line of the returned text. This + * allows matching against patterns that span multiple lines. + */ +public class RegexpAcrossOutputComparator extends ComparatorBase { + + @Override + public boolean compare(String actual, String expected) { + return Pattern.compile(expected).matcher(actual).find(); + } + +} diff --git a/src/test/org/apache/hadoop/cli/util/RegexpComparator.java b/src/test/org/apache/hadoop/cli/util/RegexpComparator.java new file mode 100644 index 00000000000..f2477466c12 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/RegexpComparator.java @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +import java.util.StringTokenizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Comparator for the Command line tests. 
+ * + * This comparator searches for the regular expression specified in 'expected' + * in the string 'actual' and returns true if the regular expression match is + * done + * + */ +public class RegexpComparator extends ComparatorBase { + + @Override + public boolean compare(String actual, String expected) { + boolean success = false; + Pattern p = Pattern.compile(expected); + + StringTokenizer tokenizer = new StringTokenizer(actual, "\n\r"); + while (tokenizer.hasMoreTokens() && !success) { + String actualToken = tokenizer.nextToken(); + Matcher m = p.matcher(actualToken); + success = m.matches(); + } + + return success; + } + +} diff --git a/src/test/org/apache/hadoop/cli/util/SubstringComparator.java b/src/test/org/apache/hadoop/cli/util/SubstringComparator.java new file mode 100644 index 00000000000..79e9a889fd8 --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/SubstringComparator.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +public class SubstringComparator extends ComparatorBase { + + @Override + public boolean compare(String actual, String expected) { + int compareOutput = actual.indexOf(expected); + if (compareOutput == -1) { + return false; + } + + return true; + } + +} diff --git a/src/test/org/apache/hadoop/cli/util/TokenComparator.java b/src/test/org/apache/hadoop/cli/util/TokenComparator.java new file mode 100644 index 00000000000..ce5b8468c5b --- /dev/null +++ b/src/test/org/apache/hadoop/cli/util/TokenComparator.java @@ -0,0 +1,49 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.cli.util; + +import java.util.StringTokenizer; + +/** + * Comparator for the Command line tests. 
+ * + * This comparator compares each token in the expected output and returns true + * if all tokens are in the actual output + * + */ +public class TokenComparator extends ComparatorBase { + + @Override + public boolean compare(String actual, String expected) { + boolean compareOutput = true; + + StringTokenizer tokenizer = new StringTokenizer(expected, ",\n\r"); + + while (tokenizer.hasMoreTokens()) { + String token = tokenizer.nextToken(); + if (actual.indexOf(token) != -1) { + compareOutput &= true; + } else { + compareOutput &= false; + } + } + + return compareOutput; + } +} diff --git a/src/test/org/apache/hadoop/conf/TestConfiguration.java b/src/test/org/apache/hadoop/conf/TestConfiguration.java new file mode 100644 index 00000000000..e509fd34641 --- /dev/null +++ b/src/test/org/apache/hadoop/conf/TestConfiguration.java @@ -0,0 +1,392 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.conf; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.DataInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ByteArrayInputStream; +import java.io.DataOutputStream; +import java.util.ArrayList; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.hadoop.fs.Path; + + +public class TestConfiguration extends TestCase { + + private Configuration conf; + final static String CONFIG = new File("./test-config.xml").getAbsolutePath(); + final static String CONFIG2 = new File("./test-config2.xml").getAbsolutePath(); + final static Random RAN = new Random(); + + @Override + protected void setUp() throws Exception { + super.setUp(); + conf = new Configuration(); + } + + @Override + protected void tearDown() throws Exception { + super.tearDown(); + new File(CONFIG).delete(); + new File(CONFIG2).delete(); + } + + private void startConfig() throws IOException{ + out.write("\n"); + out.write("\n"); + } + + private void endConfig() throws IOException{ + out.write("\n"); + out.close(); + } + + private void addInclude(String filename) throws IOException{ + out.write("\n "); + } + + public void testVariableSubstitution() throws IOException { + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + declareProperty("my.int", "${intvar}", "42"); + declareProperty("intvar", "42", "42"); + declareProperty("my.base", "/tmp/${user.name}", UNSPEC); + declareProperty("my.file", "hello", "hello"); + declareProperty("my.suffix", ".txt", ".txt"); + declareProperty("my.relfile", "${my.file}${my.suffix}", "hello.txt"); + declareProperty("my.fullfile", "${my.base}/${my.file}${my.suffix}", UNSPEC); + // check that undefined variables are returned as-is + declareProperty("my.failsexpand", "a${my.undefvar}b", 
"a${my.undefvar}b"); + endConfig(); + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + + for (Prop p : props) { + System.out.println("p=" + p.name); + String gotVal = conf.get(p.name); + String gotRawVal = conf.getRaw(p.name); + assertEq(p.val, gotRawVal); + if (p.expectEval == UNSPEC) { + // expansion is system-dependent (uses System properties) + // can't do exact match so just check that all variables got expanded + assertTrue(gotVal != null && -1 == gotVal.indexOf("${")); + } else { + assertEq(p.expectEval, gotVal); + } + } + + // check that expansion also occurs for getInt() + assertTrue(conf.getInt("intvar", -1) == 42); + assertTrue(conf.getInt("my.int", -1) == 42); + } + + public static void assertEq(Object a, Object b) { + System.out.println("assertEq: " + a + ", " + b); + assertEquals(a, b); + } + + static class Prop { + String name; + String val; + String expectEval; + } + + final String UNSPEC = null; + ArrayList props = new ArrayList(); + + void declareProperty(String name, String val, String expectEval) + throws IOException { + declareProperty(name, val, expectEval, false); + } + + void declareProperty(String name, String val, String expectEval, + boolean isFinal) + throws IOException { + appendProperty(name, val, isFinal); + Prop p = new Prop(); + p.name = name; + p.val = val; + p.expectEval = expectEval; + props.add(p); + } + + void appendProperty(String name, String val) throws IOException { + appendProperty(name, val, false); + } + + void appendProperty(String name, String val, boolean isFinal) + throws IOException { + out.write(""); + out.write(""); + out.write(name); + out.write(""); + out.write(""); + out.write(val); + out.write(""); + if (isFinal) { + out.write("true"); + } + out.write("\n"); + } + + public void testOverlay() throws IOException{ + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + appendProperty("a","b"); + appendProperty("b","c"); + appendProperty("d","e"); + appendProperty("e","f", true); + endConfig(); + + out=new BufferedWriter(new FileWriter(CONFIG2)); + startConfig(); + appendProperty("a","b"); + appendProperty("b","d"); + appendProperty("e","e"); + endConfig(); + + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + + //set dynamically something + conf.set("c","d"); + conf.set("a","d"); + + Configuration clone=new Configuration(conf); + clone.addResource(new Path(CONFIG2)); + + assertEquals(clone.get("a"), "d"); + assertEquals(clone.get("b"), "d"); + assertEquals(clone.get("c"), "d"); + assertEquals(clone.get("d"), "e"); + assertEquals(clone.get("e"), "f"); + + } + + public void testCommentsInValue() throws IOException { + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + appendProperty("my.comment", "this contains a comment"); + endConfig(); + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + //two spaces one after "this", one before "contains" + assertEquals("this contains a comment", conf.get("my.comment")); + } + + public void testTrim() throws IOException { + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + String[] whitespaces = {"", " ", "\n", "\t"}; + String[] name = new String[100]; + for(int i = 0; i < name.length; i++) { + name[i] = "foo" + i; + StringBuilder prefix = new StringBuilder(); + StringBuilder postfix = new StringBuilder(); + for(int j = 0; j < 3; j++) { + prefix.append(whitespaces[RAN.nextInt(whitespaces.length)]); + postfix.append(whitespaces[RAN.nextInt(whitespaces.length)]); + } + + 
appendProperty(prefix + name[i] + postfix, name[i] + ".value"); + } + endConfig(); + + conf.addResource(new Path(CONFIG)); + for(String n : name) { + assertEquals(n + ".value", conf.get(n)); + } + } + + public void testToString() throws IOException { + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + endConfig(); + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + + String expectedOutput = + "Configuration: core-default.xml, core-site.xml, " + + fileResource.toString(); + assertEquals(expectedOutput, conf.toString()); + } + + public void testIncludes() throws Exception { + tearDown(); + System.out.println("XXX testIncludes"); + out=new BufferedWriter(new FileWriter(CONFIG2)); + startConfig(); + appendProperty("a","b"); + appendProperty("c","d"); + endConfig(); + + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + addInclude(CONFIG2); + appendProperty("e","f"); + appendProperty("g","h"); + endConfig(); + + // verify that the includes file contains all properties + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + assertEquals(conf.get("a"), "b"); + assertEquals(conf.get("c"), "d"); + assertEquals(conf.get("e"), "f"); + assertEquals(conf.get("g"), "h"); + tearDown(); + } + + BufferedWriter out; + + public void testIntegerRanges() { + Configuration conf = new Configuration(); + conf.set("first", "-100"); + conf.set("second", "4-6,9-10,27"); + conf.set("third", "34-"); + Configuration.IntegerRanges range = conf.getRange("first", null); + System.out.println("first = " + range); + assertEquals(true, range.isIncluded(0)); + assertEquals(true, range.isIncluded(1)); + assertEquals(true, range.isIncluded(100)); + assertEquals(false, range.isIncluded(101)); + range = conf.getRange("second", null); + System.out.println("second = " + range); + assertEquals(false, range.isIncluded(3)); + assertEquals(true, range.isIncluded(4)); + assertEquals(true, range.isIncluded(6)); + assertEquals(false, range.isIncluded(7)); + assertEquals(false, range.isIncluded(8)); + assertEquals(true, range.isIncluded(9)); + assertEquals(true, range.isIncluded(10)); + assertEquals(false, range.isIncluded(11)); + assertEquals(false, range.isIncluded(26)); + assertEquals(true, range.isIncluded(27)); + assertEquals(false, range.isIncluded(28)); + range = conf.getRange("third", null); + System.out.println("third = " + range); + assertEquals(false, range.isIncluded(33)); + assertEquals(true, range.isIncluded(34)); + assertEquals(true, range.isIncluded(100000000)); + } + + public void testHexValues() throws IOException{ + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + appendProperty("test.hex1", "0x10"); + appendProperty("test.hex2", "0xF"); + appendProperty("test.hex3", "-0x10"); + endConfig(); + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + assertEquals(16, conf.getInt("test.hex1", 0)); + assertEquals(16, conf.getLong("test.hex1", 0)); + assertEquals(15, conf.getInt("test.hex2", 0)); + assertEquals(15, conf.getLong("test.hex2", 0)); + assertEquals(-16, conf.getInt("test.hex3", 0)); + assertEquals(-16, conf.getLong("test.hex3", 0)); + + } + + public void testIntegerValues() throws IOException{ + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + appendProperty("test.int1", "20"); + appendProperty("test.int2", "020"); + appendProperty("test.int3", "-20"); + endConfig(); + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + assertEquals(20, conf.getInt("test.int1", 
0)); + assertEquals(20, conf.getLong("test.int1", 0)); + assertEquals(20, conf.getInt("test.int2", 0)); + assertEquals(20, conf.getLong("test.int2", 0)); + assertEquals(-20, conf.getInt("test.int3", 0)); + assertEquals(-20, conf.getLong("test.int3", 0)); + } + + public void testReload() throws IOException { + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + appendProperty("test.key1", "final-value1", true); + appendProperty("test.key2", "value2"); + endConfig(); + Path fileResource = new Path(CONFIG); + conf.addResource(fileResource); + + out=new BufferedWriter(new FileWriter(CONFIG2)); + startConfig(); + appendProperty("test.key1", "value1"); + appendProperty("test.key3", "value3"); + endConfig(); + Path fileResource1 = new Path(CONFIG2); + conf.addResource(fileResource1); + + // add a few values via set. + conf.set("test.key3", "value4"); + conf.set("test.key4", "value5"); + + assertEquals("final-value1", conf.get("test.key1")); + assertEquals("value2", conf.get("test.key2")); + assertEquals("value4", conf.get("test.key3")); + assertEquals("value5", conf.get("test.key4")); + + // change values in the test file... + out=new BufferedWriter(new FileWriter(CONFIG)); + startConfig(); + appendProperty("test.key1", "final-value1"); + appendProperty("test.key3", "final-value3", true); + endConfig(); + + conf.reloadConfiguration(); + assertEquals("value1", conf.get("test.key1")); + // overlayed property overrides. + assertEquals("value4", conf.get("test.key3")); + assertEquals(null, conf.get("test.key2")); + assertEquals("value5", conf.get("test.key4")); + } + + public void testSize() throws IOException { + Configuration conf = new Configuration(false); + conf.set("a", "A"); + conf.set("b", "B"); + assertEquals(2, conf.size()); + } + + public void testClear() throws IOException { + Configuration conf = new Configuration(false); + conf.set("a", "A"); + conf.set("b", "B"); + conf.clear(); + assertEquals(0, conf.size()); + assertFalse(conf.iterator().hasNext()); + } + + public static void main(String[] argv) throws Exception { + junit.textui.TestRunner.main(new String[]{ + TestConfiguration.class.getName() + }); + } +} diff --git a/src/test/org/apache/hadoop/conf/TestConfigurationSubclass.java b/src/test/org/apache/hadoop/conf/TestConfigurationSubclass.java new file mode 100644 index 00000000000..fd2fa38967e --- /dev/null +++ b/src/test/org/apache/hadoop/conf/TestConfigurationSubclass.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.conf; + +import junit.framework.TestCase; + +import java.util.Properties; + +/** + * Created 21-Jan-2009 13:42:36 + */ + +public class TestConfigurationSubclass extends TestCase { + private static final String EMPTY_CONFIGURATION_XML + = "/org/apache/hadoop/conf/empty-configuration.xml"; + + + public void testGetProps() { + SubConf conf = new SubConf(true); + Properties properties = conf.getProperties(); + assertNotNull("hadoop.tmp.dir is not set", + properties.getProperty("hadoop.tmp.dir")); + } + + public void testReload() throws Throwable { + SubConf conf = new SubConf(true); + assertFalse(conf.isReloaded()); + Configuration.addDefaultResource(EMPTY_CONFIGURATION_XML); + assertTrue(conf.isReloaded()); + Properties properties = conf.getProperties(); + } + + public void testReloadNotQuiet() throws Throwable { + SubConf conf = new SubConf(true); + conf.setQuietMode(false); + assertFalse(conf.isReloaded()); + conf.addResource("not-a-valid-resource"); + assertTrue(conf.isReloaded()); + try { + Properties properties = conf.getProperties(); + fail("Should not have got here"); + } catch (RuntimeException e) { + assertTrue(e.toString(),e.getMessage().contains("not found")); + } + } + + private static class SubConf extends Configuration { + + private boolean reloaded; + + /** + * A new configuration where the behavior of reading from the default resources + * can be turned off. + * + * If the parameter {@code loadDefaults} is false, the new instance will not + * load resources from the default files. + * + * @param loadDefaults specifies whether to load from the default files + */ + private SubConf(boolean loadDefaults) { + super(loadDefaults); + } + + public Properties getProperties() { + return super.getProps(); + } + + /** + * {@inheritDoc}. + * Sets the reloaded flag. + */ + @Override + public void reloadConfiguration() { + super.reloadConfiguration(); + reloaded = true; + } + + public boolean isReloaded() { + return reloaded; + } + + public void setReloaded(boolean reloaded) { + this.reloaded = reloaded; + } + } + +} diff --git a/src/test/org/apache/hadoop/conf/TestGetInstances.java b/src/test/org/apache/hadoop/conf/TestGetInstances.java new file mode 100644 index 00000000000..57b7ff45198 --- /dev/null +++ b/src/test/org/apache/hadoop/conf/TestGetInstances.java @@ -0,0 +1,74 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.conf; + +import java.util.List; + +import junit.framework.TestCase; + +public class TestGetInstances extends TestCase { + + interface SampleInterface {} + + interface ChildInterface extends SampleInterface {} + + static class SampleClass implements SampleInterface { + SampleClass() {} + } + + static class AnotherClass implements ChildInterface { + AnotherClass() {} + } + + /** + * Makes sure Configuration.getInstances() returns + * instances of the required type. + */ + public void testGetInstances() throws Exception { + Configuration conf = new Configuration(); + + List classes = + conf.getInstances("no.such.property", SampleInterface.class); + assertTrue(classes.isEmpty()); + + conf.set("empty.property", ""); + classes = conf.getInstances("empty.property", SampleInterface.class); + assertTrue(classes.isEmpty()); + + conf.setStrings("some.classes", + SampleClass.class.getName(), AnotherClass.class.getName()); + classes = conf.getInstances("some.classes", SampleInterface.class); + assertEquals(2, classes.size()); + + try { + conf.setStrings("some.classes", + SampleClass.class.getName(), AnotherClass.class.getName(), + String.class.getName()); + conf.getInstances("some.classes", SampleInterface.class); + fail("java.lang.String does not implement SampleInterface"); + } catch (RuntimeException e) {} + + try { + conf.setStrings("some.classes", + SampleClass.class.getName(), AnotherClass.class.getName(), + "no.such.Class"); + conf.getInstances("some.classes", SampleInterface.class); + fail("no.such.Class does not exist"); + } catch (RuntimeException e) {} + } +} diff --git a/src/test/org/apache/hadoop/conf/empty-configuration.xml b/src/test/org/apache/hadoop/conf/empty-configuration.xml new file mode 100644 index 00000000000..a2086fa683f --- /dev/null +++ b/src/test/org/apache/hadoop/conf/empty-configuration.xml @@ -0,0 +1,4 @@ + + + + diff --git a/src/test/org/apache/hadoop/filecache/TestDistributedCache.java b/src/test/org/apache/hadoop/filecache/TestDistributedCache.java new file mode 100644 index 00000000000..2da7f0bc145 --- /dev/null +++ b/src/test/org/apache/hadoop/filecache/TestDistributedCache.java @@ -0,0 +1,77 @@ +package org.apache.hadoop.filecache; + +import java.io.IOException; +import java.net.URI; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import junit.framework.TestCase; + +public class TestDistributedCache extends TestCase { + + static final URI LOCAL_FS = URI.create("file:///"); + private static String TEST_CACHE_BASE_DIR = + new Path(System.getProperty("test.build.data","/tmp/cachebasedir")) + .toString().replace(' ', '+'); + private static String TEST_ROOT_DIR = + System.getProperty("test.build.data", "/tmp/distributedcache"); + private static final int TEST_FILE_SIZE = 4 * 1024; // 4K + private static final int LOCAL_CACHE_LIMIT = 5 * 1024; //5K + private Configuration conf; + private Path firstCacheFile; + private Path secondCacheFile; + private FileSystem localfs; + + /** + * @see TestCase#setUp() + */ + @Override + protected void setUp() throws IOException { + conf = new Configuration(); + conf.setLong("local.cache.size", LOCAL_CACHE_LIMIT); + localfs = FileSystem.get(LOCAL_FS, conf); + firstCacheFile = new Path(TEST_ROOT_DIR+"/firstcachefile"); + secondCacheFile = new Path(TEST_ROOT_DIR+"/secondcachefile"); + createTempFile(localfs, 
firstCacheFile); + createTempFile(localfs, secondCacheFile); + } + + /** test delete cache */ + public void testDeleteCache() throws Exception { + DistributedCache.getLocalCache(firstCacheFile.toUri(), conf, new Path(TEST_CACHE_BASE_DIR), + false, System.currentTimeMillis(), new Path(TEST_ROOT_DIR)); + DistributedCache.releaseCache(firstCacheFile.toUri(), conf); + //in above code,localized a file of size 4K and then release the cache which will cause the cache + //be deleted when the limit goes out. The below code localize another cache which's designed to + //sweep away the first cache. + DistributedCache.getLocalCache(secondCacheFile.toUri(), conf, new Path(TEST_CACHE_BASE_DIR), + false, System.currentTimeMillis(), new Path(TEST_ROOT_DIR)); + FileStatus[] dirStatuses = localfs.listStatus(new Path(TEST_CACHE_BASE_DIR)); + assertTrue("DistributedCache failed deleting old cache when the cache store is full.", + dirStatuses.length > 1); + } + + private void createTempFile(FileSystem fs, Path p) throws IOException { + FSDataOutputStream out = fs.create(p); + byte[] toWrite = new byte[TEST_FILE_SIZE]; + new Random().nextBytes(toWrite); + out.write(toWrite); + out.close(); + FileSystem.LOG.info("created: " + p + ", size=" + TEST_FILE_SIZE); + } + + /** + * @see TestCase#tearDown() + */ + @Override + protected void tearDown() throws IOException { + localfs.delete(firstCacheFile, true); + localfs.delete(secondCacheFile, true); + localfs.close(); + } +} diff --git a/src/test/org/apache/hadoop/fs/FileSystemContractBaseTest.java b/src/test/org/apache/hadoop/fs/FileSystemContractBaseTest.java new file mode 100644 index 00000000000..8bdeb3bfd7d --- /dev/null +++ b/src/test/org/apache/hadoop/fs/FileSystemContractBaseTest.java @@ -0,0 +1,471 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs; + +import java.io.FileNotFoundException; +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + *

+ * A collection of tests for the contract of the {@link FileSystem}.
+ * This test should be used for general-purpose implementations of
+ * {@link FileSystem}, that is, implementations that provide implementations
+ * of all of the functionality of {@link FileSystem}.
+ *
+ * To test a given {@link FileSystem} implementation create a subclass of this
+ * test and override {@link #setUp()} to initialize the fs
+ * {@link FileSystem} instance variable.
+ */ +public abstract class FileSystemContractBaseTest extends TestCase { + + protected FileSystem fs; + private byte[] data = new byte[getBlockSize() * 2]; // two blocks of data + { + for (int i = 0; i < data.length; i++) { + data[i] = (byte) (i % 10); + } + } + + @Override + protected void tearDown() throws Exception { + fs.delete(path("/test"), true); + } + + protected int getBlockSize() { + return 1024; + } + + protected String getDefaultWorkingDirectory() { + return "/user/" + System.getProperty("user.name"); + } + + protected boolean renameSupported() { + return true; + } + + public void testFsStatus() throws Exception { + FsStatus fsStatus = fs.getStatus(); + assertNotNull(fsStatus); + //used, free and capacity are non-negative longs + assertTrue(fsStatus.getUsed() >= 0); + assertTrue(fsStatus.getRemaining() >= 0); + assertTrue(fsStatus.getCapacity() >= 0); + } + + public void testWorkingDirectory() throws Exception { + + Path workDir = path(getDefaultWorkingDirectory()); + assertEquals(workDir, fs.getWorkingDirectory()); + + fs.setWorkingDirectory(path(".")); + assertEquals(workDir, fs.getWorkingDirectory()); + + fs.setWorkingDirectory(path("..")); + assertEquals(workDir.getParent(), fs.getWorkingDirectory()); + + Path relativeDir = path("hadoop"); + fs.setWorkingDirectory(relativeDir); + assertEquals(relativeDir, fs.getWorkingDirectory()); + + Path absoluteDir = path("/test/hadoop"); + fs.setWorkingDirectory(absoluteDir); + assertEquals(absoluteDir, fs.getWorkingDirectory()); + + } + + public void testMkdirs() throws Exception { + Path testDir = path("/test/hadoop"); + assertFalse(fs.exists(testDir)); + assertFalse(fs.isFile(testDir)); + + assertTrue(fs.mkdirs(testDir)); + + assertTrue(fs.exists(testDir)); + assertFalse(fs.isFile(testDir)); + + assertTrue(fs.mkdirs(testDir)); + + assertTrue(fs.exists(testDir)); + assertFalse(fs.isFile(testDir)); + + Path parentDir = testDir.getParent(); + assertTrue(fs.exists(parentDir)); + assertFalse(fs.isFile(parentDir)); + + Path grandparentDir = parentDir.getParent(); + assertTrue(fs.exists(grandparentDir)); + assertFalse(fs.isFile(grandparentDir)); + + } + + public void testMkdirsFailsForSubdirectoryOfExistingFile() throws Exception { + Path testDir = path("/test/hadoop"); + assertFalse(fs.exists(testDir)); + assertTrue(fs.mkdirs(testDir)); + assertTrue(fs.exists(testDir)); + + createFile(path("/test/hadoop/file")); + + Path testSubDir = path("/test/hadoop/file/subdir"); + try { + fs.mkdirs(testSubDir); + fail("Should throw IOException."); + } catch (IOException e) { + // expected + } + assertFalse(fs.exists(testSubDir)); + + Path testDeepSubDir = path("/test/hadoop/file/deep/sub/dir"); + try { + fs.mkdirs(testDeepSubDir); + fail("Should throw IOException."); + } catch (IOException e) { + // expected + } + assertFalse(fs.exists(testDeepSubDir)); + + } + + public void testGetFileStatusThrowsExceptionForNonExistentFile() + throws Exception { + try { + fs.getFileStatus(path("/test/hadoop/file")); + fail("Should throw FileNotFoundException"); + } catch (FileNotFoundException e) { + // expected + } + } + + public void testListStatusReturnsNullForNonExistentFile() throws Exception { + assertNull(fs.listStatus(path("/test/hadoop/file"))); + } + + public void testListStatus() throws Exception { + Path[] testDirs = { path("/test/hadoop/a"), + path("/test/hadoop/b"), + path("/test/hadoop/c/1"), }; + assertFalse(fs.exists(testDirs[0])); + + for (Path path : testDirs) { + assertTrue(fs.mkdirs(path)); + } + + FileStatus[] paths = 
fs.listStatus(path("/test")); + assertEquals(1, paths.length); + assertEquals(path("/test/hadoop"), paths[0].getPath()); + + paths = fs.listStatus(path("/test/hadoop")); + assertEquals(3, paths.length); + assertEquals(path("/test/hadoop/a"), paths[0].getPath()); + assertEquals(path("/test/hadoop/b"), paths[1].getPath()); + assertEquals(path("/test/hadoop/c"), paths[2].getPath()); + + paths = fs.listStatus(path("/test/hadoop/a")); + assertEquals(0, paths.length); + } + + public void testWriteReadAndDeleteEmptyFile() throws Exception { + writeReadAndDelete(0); + } + + public void testWriteReadAndDeleteHalfABlock() throws Exception { + writeReadAndDelete(getBlockSize() / 2); + } + + public void testWriteReadAndDeleteOneBlock() throws Exception { + writeReadAndDelete(getBlockSize()); + } + + public void testWriteReadAndDeleteOneAndAHalfBlocks() throws Exception { + writeReadAndDelete(getBlockSize() + (getBlockSize() / 2)); + } + + public void testWriteReadAndDeleteTwoBlocks() throws Exception { + writeReadAndDelete(getBlockSize() * 2); + } + + private void writeReadAndDelete(int len) throws IOException { + Path path = path("/test/hadoop/file"); + + fs.mkdirs(path.getParent()); + + FSDataOutputStream out = fs.create(path, false, + fs.getConf().getInt("io.file.buffer.size", 4096), + (short) 1, getBlockSize()); + out.write(data, 0, len); + out.close(); + + assertTrue("Exists", fs.exists(path)); + assertEquals("Length", len, fs.getFileStatus(path).getLen()); + + FSDataInputStream in = fs.open(path); + byte[] buf = new byte[len]; + in.readFully(0, buf); + in.close(); + + assertEquals(len, buf.length); + for (int i = 0; i < buf.length; i++) { + assertEquals("Position " + i, data[i], buf[i]); + } + + assertTrue("Deleted", fs.delete(path, false)); + + assertFalse("No longer exists", fs.exists(path)); + + } + + public void testOverwrite() throws IOException { + Path path = path("/test/hadoop/file"); + + fs.mkdirs(path.getParent()); + + createFile(path); + + assertTrue("Exists", fs.exists(path)); + assertEquals("Length", data.length, fs.getFileStatus(path).getLen()); + + try { + fs.create(path, false); + fail("Should throw IOException."); + } catch (IOException e) { + // Expected + } + + FSDataOutputStream out = fs.create(path, true); + out.write(data, 0, data.length); + out.close(); + + assertTrue("Exists", fs.exists(path)); + assertEquals("Length", data.length, fs.getFileStatus(path).getLen()); + + } + + public void testWriteInNonExistentDirectory() throws IOException { + Path path = path("/test/hadoop/file"); + assertFalse("Parent doesn't exist", fs.exists(path.getParent())); + createFile(path); + + assertTrue("Exists", fs.exists(path)); + assertEquals("Length", data.length, fs.getFileStatus(path).getLen()); + assertTrue("Parent exists", fs.exists(path.getParent())); + } + + public void testDeleteNonExistentFile() throws IOException { + Path path = path("/test/hadoop/file"); + assertFalse("Doesn't exist", fs.exists(path)); + assertFalse("No deletion", fs.delete(path, true)); + } + + public void testDeleteRecursively() throws IOException { + Path dir = path("/test/hadoop"); + Path file = path("/test/hadoop/file"); + Path subdir = path("/test/hadoop/subdir"); + + createFile(file); + assertTrue("Created subdir", fs.mkdirs(subdir)); + + assertTrue("File exists", fs.exists(file)); + assertTrue("Dir exists", fs.exists(dir)); + assertTrue("Subdir exists", fs.exists(subdir)); + + try { + fs.delete(dir, false); + fail("Should throw IOException."); + } catch (IOException e) { + // expected + } + 
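+    // A non-recursive delete of a non-empty directory must fail without
+    // removing anything, so the whole tree is expected to survive the call.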
assertTrue("File still exists", fs.exists(file)); + assertTrue("Dir still exists", fs.exists(dir)); + assertTrue("Subdir still exists", fs.exists(subdir)); + + assertTrue("Deleted", fs.delete(dir, true)); + assertFalse("File doesn't exist", fs.exists(file)); + assertFalse("Dir doesn't exist", fs.exists(dir)); + assertFalse("Subdir doesn't exist", fs.exists(subdir)); + } + + public void testDeleteEmptyDirectory() throws IOException { + Path dir = path("/test/hadoop"); + assertTrue(fs.mkdirs(dir)); + assertTrue("Dir exists", fs.exists(dir)); + assertTrue("Deleted", fs.delete(dir, false)); + assertFalse("Dir doesn't exist", fs.exists(dir)); + } + + public void testRenameNonExistentPath() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/path"); + Path dst = path("/test/new/newpath"); + rename(src, dst, false, false, false); + } + + public void testRenameFileMoveToNonExistentDirectory() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/file"); + createFile(src); + Path dst = path("/test/new/newfile"); + rename(src, dst, false, true, false); + } + + public void testRenameFileMoveToExistingDirectory() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/file"); + createFile(src); + Path dst = path("/test/new/newfile"); + fs.mkdirs(dst.getParent()); + rename(src, dst, true, false, true); + } + + public void testRenameFileAsExistingFile() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/file"); + createFile(src); + Path dst = path("/test/new/newfile"); + createFile(dst); + rename(src, dst, false, true, true); + } + + public void testRenameFileAsExistingDirectory() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/file"); + createFile(src); + Path dst = path("/test/new/newdir"); + fs.mkdirs(dst); + rename(src, dst, true, false, true); + assertTrue("Destination changed", + fs.exists(path("/test/new/newdir/file"))); + } + + public void testRenameDirectoryMoveToNonExistentDirectory() + throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/dir"); + fs.mkdirs(src); + Path dst = path("/test/new/newdir"); + rename(src, dst, false, true, false); + } + + public void testRenameDirectoryMoveToExistingDirectory() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/dir"); + fs.mkdirs(src); + createFile(path("/test/hadoop/dir/file1")); + createFile(path("/test/hadoop/dir/subdir/file2")); + + Path dst = path("/test/new/newdir"); + fs.mkdirs(dst.getParent()); + rename(src, dst, true, false, true); + + assertFalse("Nested file1 exists", + fs.exists(path("/test/hadoop/dir/file1"))); + assertFalse("Nested file2 exists", + fs.exists(path("/test/hadoop/dir/subdir/file2"))); + assertTrue("Renamed nested file1 exists", + fs.exists(path("/test/new/newdir/file1"))); + assertTrue("Renamed nested exists", + fs.exists(path("/test/new/newdir/subdir/file2"))); + } + + public void testRenameDirectoryAsExistingFile() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/dir"); + fs.mkdirs(src); + Path dst = path("/test/new/newfile"); + createFile(dst); + rename(src, dst, false, true, true); + } + + public void testRenameDirectoryAsExistingDirectory() throws Exception { + if (!renameSupported()) return; + + Path src = path("/test/hadoop/dir"); + fs.mkdirs(src); + createFile(path("/test/hadoop/dir/file1")); + 
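+    // createFile goes through fs.create, which is expected to create any
+    // missing parent directories, so dir/subdir needs no explicit mkdirs here.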
createFile(path("/test/hadoop/dir/subdir/file2")); + + Path dst = path("/test/new/newdir"); + fs.mkdirs(dst); + rename(src, dst, true, false, true); + assertTrue("Destination changed", + fs.exists(path("/test/new/newdir/dir"))); + assertFalse("Nested file1 exists", + fs.exists(path("/test/hadoop/dir/file1"))); + assertFalse("Nested file2 exists", + fs.exists(path("/test/hadoop/dir/subdir/file2"))); + assertTrue("Renamed nested file1 exists", + fs.exists(path("/test/new/newdir/dir/file1"))); + assertTrue("Renamed nested exists", + fs.exists(path("/test/new/newdir/dir/subdir/file2"))); + } + + public void testInputStreamClosedTwice() throws IOException { + //HADOOP-4760 according to Closeable#close() closing already-closed + //streams should have no effect. + Path src = path("/test/hadoop/file"); + createFile(src); + FSDataInputStream in = fs.open(src); + in.close(); + in.close(); + } + + public void testOutputStreamClosedTwice() throws IOException { + //HADOOP-4760 according to Closeable#close() closing already-closed + //streams should have no effect. + Path src = path("/test/hadoop/file"); + FSDataOutputStream out = fs.create(src); + out.writeChar('H'); //write some data + out.close(); + out.close(); + } + + protected Path path(String pathString) { + return new Path(pathString).makeQualified(fs); + } + + protected void createFile(Path path) throws IOException { + FSDataOutputStream out = fs.create(path); + out.write(data, 0, data.length); + out.close(); + } + + private void rename(Path src, Path dst, boolean renameSucceeded, + boolean srcExists, boolean dstExists) throws IOException { + assertEquals("Rename result", renameSucceeded, fs.rename(src, dst)); + assertEquals("Source exists", srcExists, fs.exists(src)); + assertEquals("Destination exists", dstExists, fs.exists(dst)); + } +} diff --git a/src/test/org/apache/hadoop/fs/TestChecksumFileSystem.java b/src/test/org/apache/hadoop/fs/TestChecksumFileSystem.java new file mode 100644 index 00000000000..c55fc3ae414 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestChecksumFileSystem.java @@ -0,0 +1,79 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs; + +import java.net.URI; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.conf.Configuration; +import junit.framework.TestCase; + +public class TestChecksumFileSystem extends TestCase { + public void testgetChecksumLength() throws Exception { + assertEquals(8, ChecksumFileSystem.getChecksumLength(0L, 512)); + assertEquals(12, ChecksumFileSystem.getChecksumLength(1L, 512)); + assertEquals(12, ChecksumFileSystem.getChecksumLength(512L, 512)); + assertEquals(16, ChecksumFileSystem.getChecksumLength(513L, 512)); + assertEquals(16, ChecksumFileSystem.getChecksumLength(1023L, 512)); + assertEquals(16, ChecksumFileSystem.getChecksumLength(1024L, 512)); + assertEquals(408, ChecksumFileSystem.getChecksumLength(100L, 1)); + assertEquals(4000000000008L, + ChecksumFileSystem.getChecksumLength(10000000000000L, 10)); + } + + public void testVerifyChecksum() throws Exception { + String TEST_ROOT_DIR + = System.getProperty("test.build.data","build/test/data/work-dir/localfs"); + + Configuration conf = new Configuration(); + LocalFileSystem localFs = FileSystem.getLocal(conf); + Path testPath = new Path(TEST_ROOT_DIR, "testPath"); + Path testPath11 = new Path(TEST_ROOT_DIR, "testPath11"); + FSDataOutputStream fout = localFs.create(testPath); + fout.write("testing".getBytes()); + fout.close(); + + fout = localFs.create(testPath11); + fout.write("testing you".getBytes()); + fout.close(); + + localFs.delete(localFs.getChecksumFile(testPath), true); + assertTrue("checksum deleted", !localFs.exists(localFs.getChecksumFile(testPath))); + + //copying the wrong checksum file + FileUtil.copy(localFs, localFs.getChecksumFile(testPath11), localFs, + localFs.getChecksumFile(testPath),false,true,conf); + assertTrue("checksum exists", localFs.exists(localFs.getChecksumFile(testPath))); + + boolean errorRead = false; + try { + TestLocalFileSystem.readFile(localFs, testPath); + }catch(ChecksumException ie) { + errorRead = true; + } + assertTrue("error reading", errorRead); + + //now setting verify false, the read should succeed + localFs.setVerifyChecksum(false); + String str = TestLocalFileSystem.readFile(localFs, testPath); + assertTrue("read", "testing".equals(str)); + + } +} diff --git a/src/test/org/apache/hadoop/fs/TestDFVariations.java b/src/test/org/apache/hadoop/fs/TestDFVariations.java new file mode 100644 index 00000000000..3999050069b --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestDFVariations.java @@ -0,0 +1,63 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.hadoop.fs; + +import junit.framework.TestCase; + +import java.io.File; +import java.io.IOException; +import java.util.EnumSet; + +public class TestDFVariations extends TestCase { + + public static class XXDF extends DF { + private final String osName; + public XXDF(String osName) throws IOException { + super(new File(System.getProperty("test.build.data","/tmp")), 0L); + this.osName = osName; + } + @Override + public DF.OSType getOSType() { + return DF.getOSType(osName); + } + @Override + protected String[] getExecString() { + switch(getOSType()) { + case OS_TYPE_AIX: + return new String[] { "echo", "IGNORE\n", "/dev/sda3", + "453115160", "400077240", "11%", "18", "skip%", "/foo/bar", "\n" }; + default: + return new String[] { "echo", "IGNORE\n", "/dev/sda3", + "453115160", "53037920", "400077240", "11%", "/foo/bar", "\n" }; + } + } + } + + public void testOSParsing() throws Exception { + for (DF.OSType ost : EnumSet.allOf(DF.OSType.class)) { + XXDF df = new XXDF(ost.getId()); + assertEquals(ost.getId() + " total", 453115160 * 1024L, df.getCapacity()); + assertEquals(ost.getId() + " used", 53037920 * 1024L, df.getUsed()); + assertEquals(ost.getId() + " avail", 400077240 * 1024L, df.getAvailable()); + assertEquals(ost.getId() + " pcnt used", 11, df.getPercentUsed()); + assertEquals(ost.getId() + " mount", "/foo/bar", df.getMount()); + } + } + +} + diff --git a/src/test/org/apache/hadoop/fs/TestDU.java b/src/test/org/apache/hadoop/fs/TestDU.java new file mode 100644 index 00000000000..6df487be55f --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestDU.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import junit.framework.TestCase; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.Random; + +/** This test makes sure that "DU" does not get to run on each call to getUsed */ +public class TestDU extends TestCase { + final static private File DU_DIR = new File( + System.getProperty("test.build.data","/tmp"), "dutmp"); + + public void setUp() throws IOException { + FileUtil.fullyDelete(DU_DIR); + assertTrue(DU_DIR.mkdirs()); + } + + public void tearDown() throws IOException { + FileUtil.fullyDelete(DU_DIR); + } + + private void createFile(File newFile, int size) throws IOException { + // write random data so that filesystems with compression enabled (e.g., ZFS) + // can't compress the file + Random random = new Random(); + byte[] data = new byte[size]; + random.nextBytes(data); + + newFile.createNewFile(); + RandomAccessFile file = new RandomAccessFile(newFile, "rws"); + + file.write(data); + + file.getFD().sync(); + file.close(); + } + + /** + * Verify that du returns expected used space for a file. 
+ * We assume here that if a file system creates a file of size
+ * that is a multiple of the block size in this file system,
+ * then the used size for the file will be exactly that size.
+ * This is true for most file systems.
+ *
+ * @throws IOException
+ * @throws InterruptedException
+ */
+ public void testDU() throws IOException, InterruptedException {
+ int writtenSize = 32*1024; // writing 32K
+ File file = new File(DU_DIR, "data");
+ createFile(file, writtenSize);
+
+ Thread.sleep(5000); // let the metadata updater catch up
+
+ DU du = new DU(file, 10000);
+ du.start();
+ long duSize = du.getUsed();
+ du.shutdown();
+
+ assertEquals(writtenSize, duSize);
+
+ //test with 0 interval, will not launch thread
+ du = new DU(file, 0);
+ du.start();
+ duSize = du.getUsed();
+ du.shutdown();
+
+ assertEquals(writtenSize, duSize);
+
+ //test without launching thread
+ du = new DU(file, 10000);
+ duSize = du.getUsed();
+
+ assertEquals(writtenSize, duSize);
+ }
+} diff --git a/src/test/org/apache/hadoop/fs/TestGetFileBlockLocations.java b/src/test/org/apache/hadoop/fs/TestGetFileBlockLocations.java new file mode 100644 index 00000000000..c85cc988627 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestGetFileBlockLocations.java @@ -0,0 +1,139 @@ +/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Testing the correctness of FileSystem.getFileBlockLocations.
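+ * The tests below exercise the API in roughly this shape (a minimal sketch;
+ * the path is illustrative, not one used by the test):
+ * <pre>
+ *   Configuration conf = new Configuration();
+ *   FileSystem fs = FileSystem.get(conf);
+ *   FileStatus status = fs.getFileStatus(new Path("/some/file"));
+ *   BlockLocation[] locs = fs.getFileBlockLocations(status, 0, status.getLen());
+ * </pre>
+ * For a requested range that intersects the file, the returned locations are
+ * expected to cover the requested bytes; for a range starting at or beyond the
+ * end of the file, an empty array is expected.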
+ */ +public class TestGetFileBlockLocations extends TestCase { + private static String TEST_ROOT_DIR = + System.getProperty("test.build.data", "/tmp/testGetFileBlockLocations"); + private static final int FileLength = 4 * 1024 * 1024; // 4MB + private Configuration conf; + private Path path; + private FileSystem fs; + private Random random; + + /** + * @see TestCase#setUp() + */ + @Override + protected void setUp() throws IOException { + conf = new Configuration(); + Path rootPath = new Path(TEST_ROOT_DIR); + path = new Path(rootPath, "TestGetFileBlockLocations"); + fs = rootPath.getFileSystem(conf); + FSDataOutputStream fsdos = fs.create(path, true); + byte[] buffer = new byte[1024]; + while (fsdos.getPos() < FileLength) { + fsdos.write(buffer); + } + fsdos.close(); + random = new Random(System.nanoTime()); + } + + private void oneTest(int offBegin, int offEnd, FileStatus status) + throws IOException { + if (offBegin > offEnd) { + int tmp = offBegin; + offBegin = offEnd; + offEnd = tmp; + } + BlockLocation[] locations = + fs.getFileBlockLocations(status, offBegin, offEnd - offBegin); + if (offBegin < status.getLen()) { + Arrays.sort(locations, new Comparator() { + + @Override + public int compare(BlockLocation arg0, BlockLocation arg1) { + long cmprv = arg0.getOffset() - arg1.getOffset(); + if (cmprv < 0) return -1; + if (cmprv > 0) return 1; + cmprv = arg0.getLength() - arg1.getLength(); + if (cmprv < 0) return -1; + if (cmprv > 0) return 1; + return 0; + } + + }); + offBegin = (int) Math.min(offBegin, status.getLen() - 1); + offEnd = (int) Math.min(offEnd, status.getLen()); + BlockLocation first = locations[0]; + BlockLocation last = locations[locations.length - 1]; + assertTrue(first.getOffset() <= offBegin); + assertTrue(offEnd <= last.getOffset() + last.getLength()); + } else { + assertTrue(locations.length == 0); + } + } + /** + * @see TestCase#tearDown() + */ + @Override + protected void tearDown() throws IOException { + fs.delete(path, true); + fs.close(); + } + + public void testFailureNegativeParameters() throws IOException { + FileStatus status = fs.getFileStatus(path); + try { + BlockLocation[] locations = fs.getFileBlockLocations(status, -1, 100); + fail("Expecting exception being throw"); + } catch (IllegalArgumentException e) { + + } + + try { + BlockLocation[] locations = fs.getFileBlockLocations(status, 100, -1); + fail("Expecting exception being throw"); + } catch (IllegalArgumentException e) { + + } + } + + public void testGetFileBlockLocations1() throws IOException { + FileStatus status = fs.getFileStatus(path); + oneTest(0, (int) status.getLen(), status); + oneTest(0, (int) status.getLen() * 2, status); + oneTest((int) status.getLen() * 2, (int) status.getLen() * 4, status); + oneTest((int) status.getLen() / 2, (int) status.getLen() * 3, status); + for (int i = 0; i < 10; ++i) { + oneTest((int) status.getLen() * i / 10, (int) status.getLen() * (i + 1) + / 10, status); + } + } + + public void testGetFileBlockLocations2() throws IOException { + FileStatus status = fs.getFileStatus(path); + for (int i = 0; i < 1000; ++i) { + int offBegin = random.nextInt((int) (2 * status.getLen())); + int offEnd = random.nextInt((int) (2 * status.getLen())); + oneTest(offBegin, offEnd, status); + } + } +} diff --git a/src/test/org/apache/hadoop/fs/TestGlobExpander.java b/src/test/org/apache/hadoop/fs/TestGlobExpander.java new file mode 100644 index 00000000000..b0466b80229 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestGlobExpander.java @@ -0,0 +1,62 @@ +/** + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import java.io.IOException; +import java.util.List; + +import junit.framework.TestCase; + +public class TestGlobExpander extends TestCase { + + public void testExpansionIsIdentical() throws IOException { + checkExpansionIsIdentical(""); + checkExpansionIsIdentical("/}"); + checkExpansionIsIdentical("/}{a,b}"); + checkExpansionIsIdentical("{/"); + checkExpansionIsIdentical("{a}"); + checkExpansionIsIdentical("{a,b}/{b,c}"); + checkExpansionIsIdentical("p\\{a/b,c/d\\}s"); + checkExpansionIsIdentical("p{a\\/b,c\\/d}s"); + } + + public void testExpansion() throws IOException { + checkExpansion("{a/b}", "a/b"); + checkExpansion("/}{a/b}", "/}a/b"); + checkExpansion("p{a/b,c/d}s", "pa/bs", "pc/ds"); + checkExpansion("{a/b,c/d,{e,f}}", "a/b", "c/d", "{e,f}"); + checkExpansion("{a/b,c/d}{e,f}", "a/b{e,f}", "c/d{e,f}"); + checkExpansion("{a,b}/{b,{c/d,e/f}}", "{a,b}/b", "{a,b}/c/d", "{a,b}/e/f"); + checkExpansion("{a,b}/{c/\\d}", "{a,b}/c/d"); + } + + private void checkExpansionIsIdentical(String filePattern) throws IOException { + checkExpansion(filePattern, filePattern); + } + + private void checkExpansion(String filePattern, String... expectedExpansions) + throws IOException { + List actualExpansions = GlobExpander.expand(filePattern); + assertEquals("Different number of expansions", expectedExpansions.length, + actualExpansions.size()); + for (int i = 0; i < expectedExpansions.length; i++) { + assertEquals("Expansion of " + filePattern, expectedExpansions[i], + actualExpansions.get(i)); + } + } +} diff --git a/src/test/org/apache/hadoop/fs/TestLocalDirAllocator.java b/src/test/org/apache/hadoop/fs/TestLocalDirAllocator.java new file mode 100644 index 00000000000..eef90308aa9 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestLocalDirAllocator.java @@ -0,0 +1,211 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.fs; + +import java.io.File; +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Shell; + +import junit.framework.TestCase; + +/** This test LocalDirAllocator works correctly; + * Every test case uses different buffer dirs to + * enforce the AllocatorPerContext initialization. + * This test does not run on Cygwin because under Cygwin + * a directory can be created in a read-only directory + * which breaks this test. + */ +public class TestLocalDirAllocator extends TestCase { + final static private Configuration conf = new Configuration(); + final static private String BUFFER_DIR_ROOT = "build/test/temp"; + final static private Path BUFFER_PATH_ROOT = new Path(BUFFER_DIR_ROOT); + final static private File BUFFER_ROOT = new File(BUFFER_DIR_ROOT); + final static private String BUFFER_DIR[] = new String[] { + BUFFER_DIR_ROOT+"/tmp0", BUFFER_DIR_ROOT+"/tmp1", BUFFER_DIR_ROOT+"/tmp2", + BUFFER_DIR_ROOT+"/tmp3", BUFFER_DIR_ROOT+"/tmp4", BUFFER_DIR_ROOT+"/tmp5", + BUFFER_DIR_ROOT+"/tmp6"}; + final static private Path BUFFER_PATH[] = new Path[] { + new Path(BUFFER_DIR[0]), new Path(BUFFER_DIR[1]), new Path(BUFFER_DIR[2]), + new Path(BUFFER_DIR[3]), new Path(BUFFER_DIR[4]), new Path(BUFFER_DIR[5]), + new Path(BUFFER_DIR[6])}; + final static private String CONTEXT = "dfs.client.buffer.dir"; + final static private String FILENAME = "block"; + final static private LocalDirAllocator dirAllocator = + new LocalDirAllocator(CONTEXT); + static LocalFileSystem localFs; + final static private boolean isWindows = + System.getProperty("os.name").startsWith("Windows"); + final static int SMALL_FILE_SIZE = 100; + static { + try { + localFs = FileSystem.getLocal(conf); + rmBufferDirs(); + } catch(IOException e) { + System.out.println(e.getMessage()); + e.printStackTrace(); + System.exit(-1); + } + } + + private static void rmBufferDirs() throws IOException { + assertTrue(!localFs.exists(BUFFER_PATH_ROOT) || + localFs.delete(BUFFER_PATH_ROOT, true)); + } + + private void validateTempDirCreation(int i) throws IOException { + File result = createTempFile(SMALL_FILE_SIZE); + assertTrue("Checking for " + BUFFER_DIR[i] + " in " + result + " - FAILED!", + result.getPath().startsWith(new File(BUFFER_DIR[i], FILENAME).getPath())); + } + + private File createTempFile() throws IOException { + File result = dirAllocator.createTmpFileForWrite(FILENAME, -1, conf); + result.delete(); + return result; + } + + private File createTempFile(long size) throws IOException { + File result = dirAllocator.createTmpFileForWrite(FILENAME, size, conf); + result.delete(); + return result; + } + + /** Two buffer dirs. The first dir does not exist & is on a read-only disk; + * The second dir exists & is RW + * @throws Exception + */ + public void test0() throws Exception { + if (isWindows) return; + try { + conf.set(CONTEXT, BUFFER_DIR[0]+","+BUFFER_DIR[1]); + assertTrue(localFs.mkdirs(BUFFER_PATH[1])); + BUFFER_ROOT.setReadOnly(); + validateTempDirCreation(1); + validateTempDirCreation(1); + } finally { + Shell.execCommand(new String[]{"chmod", "u+w", BUFFER_DIR_ROOT}); + rmBufferDirs(); + } + } + + /** Two buffer dirs. 
The first dir exists & is on a read-only disk; + * The second dir exists & is RW + * @throws Exception + */ + public void test1() throws Exception { + if (isWindows) return; + try { + conf.set(CONTEXT, BUFFER_DIR[1]+","+BUFFER_DIR[2]); + assertTrue(localFs.mkdirs(BUFFER_PATH[2])); + BUFFER_ROOT.setReadOnly(); + validateTempDirCreation(2); + validateTempDirCreation(2); + } finally { + Shell.execCommand(new String[]{"chmod", "u+w", BUFFER_DIR_ROOT}); + rmBufferDirs(); + } + } + /** Two buffer dirs. Both do not exist but on a RW disk. + * Check if tmp dirs are allocated in a round-robin + */ + public void test2() throws Exception { + if (isWindows) return; + try { + conf.set(CONTEXT, BUFFER_DIR[2]+","+BUFFER_DIR[3]); + + // create the first file, and then figure the round-robin sequence + createTempFile(SMALL_FILE_SIZE); + int firstDirIdx = (dirAllocator.getCurrentDirectoryIndex() == 0) ? 2 : 3; + int secondDirIdx = (firstDirIdx == 2) ? 3 : 2; + + // check if tmp dirs are allocated in a round-robin manner + validateTempDirCreation(firstDirIdx); + validateTempDirCreation(secondDirIdx); + validateTempDirCreation(firstDirIdx); + } finally { + rmBufferDirs(); + } + } + + /** Two buffer dirs. Both exists and on a R/W disk. + * Later disk1 becomes read-only. + * @throws Exception + */ + public void test3() throws Exception { + if (isWindows) return; + try { + conf.set(CONTEXT, BUFFER_DIR[3]+","+BUFFER_DIR[4]); + assertTrue(localFs.mkdirs(BUFFER_PATH[3])); + assertTrue(localFs.mkdirs(BUFFER_PATH[4])); + + // create the first file with size, and then figure the round-robin sequence + createTempFile(SMALL_FILE_SIZE); + + int nextDirIdx = (dirAllocator.getCurrentDirectoryIndex() == 0) ? 3 : 4; + validateTempDirCreation(nextDirIdx); + + // change buffer directory 2 to be read only + new File(BUFFER_DIR[4]).setReadOnly(); + validateTempDirCreation(3); + validateTempDirCreation(3); + } finally { + rmBufferDirs(); + } + } + + /** + * Two buffer dirs, on read-write disk. + * + * Try to create a whole bunch of files. + * Verify that they do indeed all get created where they should. + * + * Would ideally check statistical properties of distribution, but + * we don't have the nerve to risk false-positives here. + * + * @throws Exception + */ + static final int TRIALS = 100; + public void test4() throws Exception { + if (isWindows) return; + try { + + conf.set(CONTEXT, BUFFER_DIR[5]+","+BUFFER_DIR[6]); + assertTrue(localFs.mkdirs(BUFFER_PATH[5])); + assertTrue(localFs.mkdirs(BUFFER_PATH[6])); + + int inDir5=0, inDir6=0; + for(int i = 0; i < TRIALS; ++i) { + File result = createTempFile(); + if(result.getPath().startsWith(new File(BUFFER_DIR[5], FILENAME).getPath())) { + inDir5++; + } else if(result.getPath().startsWith(new File(BUFFER_DIR[6], FILENAME).getPath())) { + inDir6++; + } + result.delete(); + } + + assertTrue( inDir5 + inDir6 == TRIALS); + + } finally { + rmBufferDirs(); + } + } + +} diff --git a/src/test/org/apache/hadoop/fs/TestLocalFileSystem.java b/src/test/org/apache/hadoop/fs/TestLocalFileSystem.java new file mode 100644 index 00000000000..b244b9b5df4 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestLocalFileSystem.java @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import org.apache.hadoop.conf.Configuration; +import java.io.*; +import junit.framework.*; + +/** + * This class tests the local file system via the FileSystem abstraction. + */ +public class TestLocalFileSystem extends TestCase { + private static String TEST_ROOT_DIR + = System.getProperty("test.build.data","build/test/data/work-dir/localfs"); + + + static void writeFile(FileSystem fs, Path name) throws IOException { + FSDataOutputStream stm = fs.create(name); + stm.writeBytes("42\n"); + stm.close(); + } + + static String readFile(FileSystem fs, Path name) throws IOException { + byte[] b = new byte[1024]; + int offset = 0; + FSDataInputStream in = fs.open(name); + for(int remaining, n; + (remaining = b.length - offset) > 0 && (n = in.read(b, offset, remaining)) != -1; + offset += n); + in.close(); + + String s = new String(b, 0, offset); + System.out.println("s=" + s); + return s; + } + + private void cleanupFile(FileSystem fs, Path name) throws IOException { + assertTrue(fs.exists(name)); + fs.delete(name, true); + assertTrue(!fs.exists(name)); + } + + /** + * Test the capability of setting the working directory. + */ + public void testWorkingDirectory() throws IOException { + Configuration conf = new Configuration(); + FileSystem fileSys = FileSystem.getLocal(conf); + Path origDir = fileSys.getWorkingDirectory(); + Path subdir = new Path(TEST_ROOT_DIR, "new"); + try { + // make sure it doesn't already exist + assertTrue(!fileSys.exists(subdir)); + // make it and check for it + assertTrue(fileSys.mkdirs(subdir)); + assertTrue(fileSys.isDirectory(subdir)); + + fileSys.setWorkingDirectory(subdir); + + // create a directory and check for it + Path dir1 = new Path("dir1"); + assertTrue(fileSys.mkdirs(dir1)); + assertTrue(fileSys.isDirectory(dir1)); + + // delete the directory and make sure it went away + fileSys.delete(dir1, true); + assertTrue(!fileSys.exists(dir1)); + + // create files and manipulate them. 
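+ // (the paths below are relative, so with the working directory set to
+ // TEST_ROOT_DIR/new above, "file1" is expected to resolve to
+ // TEST_ROOT_DIR/new/file1; this is an illustrative note, not something
+ // the test asserts directly)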
+ Path file1 = new Path("file1"); + Path file2 = new Path("sub/file2"); + writeFile(fileSys, file1); + fileSys.copyFromLocalFile(file1, file2); + assertTrue(fileSys.exists(file1)); + assertTrue(fileSys.isFile(file1)); + cleanupFile(fileSys, file2); + fileSys.copyToLocalFile(file1, file2); + cleanupFile(fileSys, file2); + + // try a rename + fileSys.rename(file1, file2); + assertTrue(!fileSys.exists(file1)); + assertTrue(fileSys.exists(file2)); + fileSys.rename(file2, file1); + + // try reading a file + InputStream stm = fileSys.open(file1); + byte[] buffer = new byte[3]; + int bytesRead = stm.read(buffer, 0, 3); + assertEquals("42\n", new String(buffer, 0, bytesRead)); + stm.close(); + } finally { + fileSys.setWorkingDirectory(origDir); + fileSys.delete(subdir, true); + } + } + + public void testCopy() throws IOException { + Configuration conf = new Configuration(); + LocalFileSystem fs = FileSystem.getLocal(conf); + Path src = new Path(TEST_ROOT_DIR, "dingo"); + Path dst = new Path(TEST_ROOT_DIR, "yak"); + writeFile(fs, src); + assertTrue(FileUtil.copy(fs, src, fs, dst, true, false, conf)); + assertTrue(!fs.exists(src) && fs.exists(dst)); + assertTrue(FileUtil.copy(fs, dst, fs, src, false, false, conf)); + assertTrue(fs.exists(src) && fs.exists(dst)); + assertTrue(FileUtil.copy(fs, src, fs, dst, true, true, conf)); + assertTrue(!fs.exists(src) && fs.exists(dst)); + fs.mkdirs(src); + assertTrue(FileUtil.copy(fs, dst, fs, src, false, false, conf)); + Path tmp = new Path(src, dst.getName()); + assertTrue(fs.exists(tmp) && fs.exists(dst)); + assertTrue(FileUtil.copy(fs, dst, fs, src, false, true, conf)); + assertTrue(fs.delete(tmp, true)); + fs.mkdirs(tmp); + try { + FileUtil.copy(fs, dst, fs, src, true, true, conf); + fail("Failed to detect existing dir"); + } catch (IOException e) { } + } + + public void testHomeDirectory() throws IOException { + Configuration conf = new Configuration(); + FileSystem fileSys = FileSystem.getLocal(conf); + Path home = new Path(System.getProperty("user.home")) + .makeQualified(fileSys); + Path fsHome = fileSys.getHomeDirectory(); + assertEquals(home, fsHome); + } + + public void testPathEscapes() throws IOException { + Configuration conf = new Configuration(); + FileSystem fs = FileSystem.getLocal(conf); + Path path = new Path(TEST_ROOT_DIR, "foo%bar"); + writeFile(fs, path); + FileStatus status = fs.getFileStatus(path); + assertEquals(path.makeQualified(fs), status.getPath()); + cleanupFile(fs, path); + } +} diff --git a/src/test/org/apache/hadoop/fs/TestLocalFileSystemPermission.java b/src/test/org/apache/hadoop/fs/TestLocalFileSystemPermission.java new file mode 100644 index 00000000000..f68cdb66cdf --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestLocalFileSystemPermission.java @@ -0,0 +1,157 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.permission.*; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Shell; + +import java.io.*; +import java.util.*; + +import junit.framework.*; + +/** + * This class tests the local file system via the FileSystem abstraction. + */ +public class TestLocalFileSystemPermission extends TestCase { + static final String TEST_PATH_PREFIX = new Path(System.getProperty( + "test.build.data", "/tmp")).toString().replace(' ', '_') + + "/" + TestLocalFileSystemPermission.class.getSimpleName() + "_"; + + { + try { + ((org.apache.commons.logging.impl.Log4JLogger)FileSystem.LOG).getLogger() + .setLevel(org.apache.log4j.Level.DEBUG); + } + catch(Exception e) { + System.out.println("Cannot change log level\n" + + StringUtils.stringifyException(e)); + } + } + + private Path writeFile(FileSystem fs, String name) throws IOException { + Path f = new Path(TEST_PATH_PREFIX + name); + FSDataOutputStream stm = fs.create(f); + stm.writeBytes("42\n"); + stm.close(); + return f; + } + + private void cleanupFile(FileSystem fs, Path name) throws IOException { + assertTrue(fs.exists(name)); + fs.delete(name, true); + assertTrue(!fs.exists(name)); + } + + /** Test LocalFileSystem.setPermission */ + public void testLocalFSsetPermission() throws IOException { + if (Path.WINDOWS) { + System.out.println("Cannot run test for Windows"); + return; + } + Configuration conf = new Configuration(); + LocalFileSystem localfs = FileSystem.getLocal(conf); + String filename = "foo"; + Path f = writeFile(localfs, filename); + try { + System.out.println(filename + ": " + getPermission(localfs, f)); + } + catch(Exception e) { + System.out.println(StringUtils.stringifyException(e)); + System.out.println("Cannot run test"); + return; + } + + try { + // create files and manipulate them. + FsPermission all = new FsPermission((short)0777); + FsPermission none = new FsPermission((short)0); + + localfs.setPermission(f, none); + assertEquals(none, getPermission(localfs, f)); + + localfs.setPermission(f, all); + assertEquals(all, getPermission(localfs, f)); + } + finally {cleanupFile(localfs, f);} + } + + FsPermission getPermission(LocalFileSystem fs, Path p) throws IOException { + return fs.getFileStatus(p).getPermission(); + } + + /** Test LocalFileSystem.setOwner */ + public void testLocalFSsetOwner() throws IOException { + if (Path.WINDOWS) { + System.out.println("Cannot run test for Windows"); + return; + } + + Configuration conf = new Configuration(); + LocalFileSystem localfs = FileSystem.getLocal(conf); + String filename = "bar"; + Path f = writeFile(localfs, filename); + List groups = null; + try { + groups = getGroups(); + System.out.println(filename + ": " + getPermission(localfs, f)); + } + catch(IOException e) { + System.out.println(StringUtils.stringifyException(e)); + System.out.println("Cannot run test"); + return; + } + if (groups == null || groups.size() < 1) { + System.out.println("Cannot run test: need at least one group. groups=" + + groups); + return; + } + + // create files and manipulate them. 
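+ // (setOwner(path, null, group) leaves the file owner unchanged and only
+ // changes the group, which is why the checks below compare getGroup())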
+ try { + String g0 = groups.get(0); + localfs.setOwner(f, null, g0); + assertEquals(g0, getGroup(localfs, f)); + + if (groups.size() > 1) { + String g1 = groups.get(1); + localfs.setOwner(f, null, g1); + assertEquals(g1, getGroup(localfs, f)); + } else { + System.out.println("Not testing changing the group since user " + + "belongs to only one group."); + } + } + finally {cleanupFile(localfs, f);} + } + + static List getGroups() throws IOException { + List a = new ArrayList(); + String s = Shell.execCommand(Shell.getGROUPS_COMMAND()); + for(StringTokenizer t = new StringTokenizer(s); t.hasMoreTokens(); ) { + a.add(t.nextToken()); + } + return a; + } + + String getGroup(LocalFileSystem fs, Path p) throws IOException { + return fs.getFileStatus(p).getGroup(); + } +} diff --git a/src/test/org/apache/hadoop/fs/TestPath.java b/src/test/org/apache/hadoop/fs/TestPath.java new file mode 100644 index 00000000000..4fa28bc77ce --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestPath.java @@ -0,0 +1,152 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs; + +import java.util.*; +import junit.framework.TestCase; + +public class TestPath extends TestCase { + public void testToString() { + toStringTest("/"); + toStringTest("/foo"); + toStringTest("/foo/bar"); + toStringTest("foo"); + toStringTest("foo/bar"); + boolean emptyException = false; + try { + toStringTest(""); + } catch (IllegalArgumentException e) { + // expect to receive an IllegalArgumentException + emptyException = true; + } + assertTrue(emptyException); + if (Path.WINDOWS) { + toStringTest("c:"); + toStringTest("c:/"); + toStringTest("c:foo"); + toStringTest("c:foo/bar"); + toStringTest("c:foo/bar"); + toStringTest("c:/foo/bar"); + } + } + + private void toStringTest(String pathString) { + assertEquals(pathString, new Path(pathString).toString()); + } + + public void testNormalize() { + assertEquals("/", new Path("//").toString()); + assertEquals("/foo", new Path("/foo/").toString()); + assertEquals("/foo", new Path("/foo/").toString()); + assertEquals("foo", new Path("foo/").toString()); + assertEquals("foo", new Path("foo//").toString()); + assertEquals("foo/bar", new Path("foo//bar").toString()); + if (Path.WINDOWS) { + assertEquals("c:/a/b", new Path("c:\\a\\b").toString()); + } + } + + public void testIsAbsolute() { + assertTrue(new Path("/").isAbsolute()); + assertTrue(new Path("/foo").isAbsolute()); + assertFalse(new Path("foo").isAbsolute()); + assertFalse(new Path("foo/bar").isAbsolute()); + assertFalse(new Path(".").isAbsolute()); + if (Path.WINDOWS) { + assertTrue(new Path("c:/a/b").isAbsolute()); + assertFalse(new Path("c:a/b").isAbsolute()); + } + } + + public void testParent() { + assertEquals(new Path("/foo"), new Path("/foo/bar").getParent()); + assertEquals(new Path("foo"), new Path("foo/bar").getParent()); + assertEquals(new Path("/"), new Path("/foo").getParent()); + if (Path.WINDOWS) { + assertEquals(new Path("c:/"), new Path("c:/foo").getParent()); + } + } + + public void testChild() { + assertEquals(new Path("."), new Path(".", ".")); + assertEquals(new Path("/"), new Path("/", ".")); + assertEquals(new Path("/"), new Path(".", "/")); + assertEquals(new Path("/foo"), new Path("/", "foo")); + assertEquals(new Path("/foo/bar"), new Path("/foo", "bar")); + assertEquals(new Path("/foo/bar/baz"), new Path("/foo/bar", "baz")); + assertEquals(new Path("/foo/bar/baz"), new Path("/foo", "bar/baz")); + assertEquals(new Path("foo"), new Path(".", "foo")); + assertEquals(new Path("foo/bar"), new Path("foo", "bar")); + assertEquals(new Path("foo/bar/baz"), new Path("foo", "bar/baz")); + assertEquals(new Path("foo/bar/baz"), new Path("foo/bar", "baz")); + assertEquals(new Path("/foo"), new Path("/bar", "/foo")); + if (Path.WINDOWS) { + assertEquals(new Path("c:/foo"), new Path("/bar", "c:/foo")); + assertEquals(new Path("c:/foo"), new Path("d:/bar", "c:/foo")); + } + } + + public void testEquals() { + assertFalse(new Path("/").equals(new Path("/foo"))); + } + + public void testDots() { + // Test Path(String) + assertEquals(new Path("/foo/bar/baz").toString(), "/foo/bar/baz"); + assertEquals(new Path("/foo/bar", ".").toString(), "/foo/bar"); + assertEquals(new Path("/foo/bar/../baz").toString(), "/foo/baz"); + assertEquals(new Path("/foo/bar/./baz").toString(), "/foo/bar/baz"); + assertEquals(new Path("/foo/bar/baz/../../fud").toString(), "/foo/fud"); + assertEquals(new Path("/foo/bar/baz/.././../fud").toString(), "/foo/fud"); + assertEquals(new Path("../../foo/bar").toString(), "../../foo/bar"); + assertEquals(new 
Path(".././../foo/bar").toString(), "../../foo/bar"); + assertEquals(new Path("./foo/bar/baz").toString(), "foo/bar/baz"); + assertEquals(new Path("/foo/bar/../../baz/boo").toString(), "/baz/boo"); + assertEquals(new Path("foo/bar/").toString(), "foo/bar"); + assertEquals(new Path("foo/bar/../baz").toString(), "foo/baz"); + assertEquals(new Path("foo/bar/../../baz/boo").toString(), "baz/boo"); + + + // Test Path(Path,Path) + assertEquals(new Path("/foo/bar", "baz/boo").toString(), "/foo/bar/baz/boo"); + assertEquals(new Path("foo/bar/","baz/bud").toString(), "foo/bar/baz/bud"); + + assertEquals(new Path("/foo/bar","../../boo/bud").toString(), "/boo/bud"); + assertEquals(new Path("foo/bar","../../boo/bud").toString(), "boo/bud"); + assertEquals(new Path(".","boo/bud").toString(), "boo/bud"); + + assertEquals(new Path("/foo/bar/baz","../../boo/bud").toString(), "/foo/boo/bud"); + assertEquals(new Path("foo/bar/baz","../../boo/bud").toString(), "foo/boo/bud"); + + + assertEquals(new Path("../../","../../boo/bud").toString(), "../../../../boo/bud"); + assertEquals(new Path("../../foo","../../../boo/bud").toString(), "../../../../boo/bud"); + assertEquals(new Path("../../foo/bar","../boo/bud").toString(), "../../foo/boo/bud"); + + assertEquals(new Path("foo/bar/baz","../../..").toString(), ""); + assertEquals(new Path("foo/bar/baz","../../../../..").toString(), "../.."); + } + + public void testScheme() throws java.io.IOException { + assertEquals("foo:/bar", new Path("foo:/","/bar").toString()); + assertEquals("foo://bar/baz", new Path("foo://bar/","/baz").toString()); + } + + +} diff --git a/src/test/org/apache/hadoop/fs/TestTrash.java b/src/test/org/apache/hadoop/fs/TestTrash.java new file mode 100644 index 00000000000..cff1f2419b7 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestTrash.java @@ -0,0 +1,313 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + + +import junit.framework.TestCase; +import java.io.File; +import java.io.IOException; +import java.io.DataOutputStream; +import java.net.URI; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FsShell; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.Trash; +import org.apache.hadoop.fs.LocalFileSystem; + +/** + * This class tests commands from Trash. 
+ */ +public class TestTrash extends TestCase { + + private final static Path TEST_DIR = + new Path(new File(System.getProperty("test.build.data","/tmp") + ).toURI().toString().replace(' ', '+'), "testTrash"); + + protected static Path writeFile(FileSystem fs, Path f) throws IOException { + DataOutputStream out = fs.create(f); + out.writeBytes("dhruba: " + f); + out.close(); + assertTrue(fs.exists(f)); + return f; + } + + protected static Path mkdir(FileSystem fs, Path p) throws IOException { + assertTrue(fs.mkdirs(p)); + assertTrue(fs.exists(p)); + assertTrue(fs.getFileStatus(p).isDir()); + return p; + } + + // check that the specified file is in Trash + protected static void checkTrash(FileSystem fs, Path trashRoot, + Path path) throws IOException { + Path p = new Path(trashRoot+"/"+ path.toUri().getPath()); + assertTrue(fs.exists(p)); + } + + // check that the specified file is not in Trash + static void checkNotInTrash(FileSystem fs, Path trashRoot, String pathname) + throws IOException { + Path p = new Path(trashRoot+"/"+ new Path(pathname).getName()); + assertTrue(!fs.exists(p)); + } + + protected static void trashShell(final FileSystem fs, final Path base) + throws IOException { + Configuration conf = new Configuration(); + conf.set("fs.trash.interval", "10"); // 10 minute + conf.set("fs.default.name", fs.getUri().toString()); + FsShell shell = new FsShell(); + shell.setConf(conf); + Path trashRoot = null; + + // First create a new directory with mkdirs + Path myPath = new Path(base, "test/mkdirs"); + mkdir(fs, myPath); + + // Second, create a file in that directory. + Path myFile = new Path(base, "test/mkdirs/myFile"); + writeFile(fs, myFile); + + // Verify that expunge without Trash directory + // won't throw Exception + { + String[] args = new String[1]; + args[0] = "-expunge"; + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + } + + // Verify that we succeed in removing the file we created. + // This should go into Trash. + { + String[] args = new String[2]; + args[0] = "-rm"; + args[1] = myFile.toString(); + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + + trashRoot = shell.getCurrentTrashDir(); + checkTrash(fs, trashRoot, myFile); + } + + // Verify that we can recreate the file + writeFile(fs, myFile); + + // Verify that we succeed in removing the file we re-created + { + String[] args = new String[2]; + args[0] = "-rm"; + args[1] = new Path(base, "test/mkdirs/myFile").toString(); + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + } + + // Verify that we can recreate the file + writeFile(fs, myFile); + + // Verify that we succeed in removing the whole directory + // along with the file inside it. 
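+ // (this mirrors what "hadoop fs -rmr <path>" does from the command line;
+ // the FsShell object above stands in for the CLI)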
+ { + String[] args = new String[2]; + args[0] = "-rmr"; + args[1] = new Path(base, "test/mkdirs").toString(); + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + } + + // recreate directory + mkdir(fs, myPath); + + // Verify that we succeed in removing the whole directory + { + String[] args = new String[2]; + args[0] = "-rmr"; + args[1] = new Path(base, "test/mkdirs").toString(); + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + } + + // Check that we can delete a file from the trash + { + Path toErase = new Path(trashRoot, "toErase"); + int retVal = -1; + writeFile(fs, toErase); + try { + retVal = shell.run(new String[] {"-rm", toErase.toString()}); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(retVal == 0); + checkNotInTrash (fs, trashRoot, toErase.toString()); + checkNotInTrash (fs, trashRoot, toErase.toString()+".1"); + } + + // simulate Trash removal + { + String[] args = new String[1]; + args[0] = "-expunge"; + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + } + + // verify that after expunging the Trash, it really goes away + checkNotInTrash(fs, trashRoot, new Path(base, "test/mkdirs/myFile").toString()); + + // recreate directory and file + mkdir(fs, myPath); + writeFile(fs, myFile); + + // remove file first, then remove directory + { + String[] args = new String[2]; + args[0] = "-rm"; + args[1] = myFile.toString(); + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + checkTrash(fs, trashRoot, myFile); + + args = new String[2]; + args[0] = "-rmr"; + args[1] = myPath.toString(); + val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == 0); + checkTrash(fs, trashRoot, myPath); + } + + // attempt to remove parent of trash + { + String[] args = new String[2]; + args[0] = "-rmr"; + args[1] = trashRoot.getParent().getParent().toString(); + int val = -1; + try { + val = shell.run(args); + } catch (Exception e) { + System.err.println("Exception raised from Trash.run " + + e.getLocalizedMessage()); + } + assertTrue(val == -1); + assertTrue(fs.exists(trashRoot)); + } + } + + public static void trashNonDefaultFS(Configuration conf) throws IOException { + conf.set("fs.trash.interval", "10"); // 10 minute + // attempt non-default FileSystem trash + { + final FileSystem lfs = FileSystem.getLocal(conf); + Path p = TEST_DIR; + Path f = new Path(p, "foo/bar"); + if (lfs.exists(p)) { + lfs.delete(p, true); + } + try { + f = writeFile(lfs, f); + + FileSystem.closeAll(); + FileSystem localFs = FileSystem.get(URI.create("file:///"), conf); + Trash lTrash = new Trash(localFs, conf); + lTrash.moveToTrash(f.getParent()); + checkTrash(localFs, lTrash.getCurrentTrashDir(), f); + } finally { + if (lfs.exists(p)) { + lfs.delete(p, true); + } + } + } + } + + public void testTrash() throws IOException { + Configuration conf = new Configuration(); + 
conf.setClass("fs.file.impl", TestLFS.class, FileSystem.class); + trashShell(FileSystem.getLocal(conf), TEST_DIR); + } + + public void testNonDefaultFS() throws IOException { + Configuration conf = new Configuration(); + conf.setClass("fs.file.impl", TestLFS.class, FileSystem.class); + conf.set("fs.default.name", "invalid://host/bar/foo"); + trashNonDefaultFS(conf); + } + + static class TestLFS extends LocalFileSystem { + Path home; + TestLFS() { + this(TEST_DIR); + } + TestLFS(Path home) { + super(); + this.home = home; + } + public Path getHomeDirectory() { + return home; + } + } +} diff --git a/src/test/org/apache/hadoop/fs/TestTruncatedInputBug.java b/src/test/org/apache/hadoop/fs/TestTruncatedInputBug.java new file mode 100644 index 00000000000..e7dabf903cd --- /dev/null +++ b/src/test/org/apache/hadoop/fs/TestTruncatedInputBug.java @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import java.io.DataOutputStream; +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; + +/** + * test for the input truncation bug when mark/reset is used. + * HADOOP-1489 + */ +public class TestTruncatedInputBug extends TestCase { + private static String TEST_ROOT_DIR = + new Path(System.getProperty("test.build.data","/tmp")) + .toString().replace(' ', '+'); + + private void writeFile(FileSystem fileSys, + Path name, int nBytesToWrite) + throws IOException { + DataOutputStream out = fileSys.create(name); + for (int i = 0; i < nBytesToWrite; ++i) { + out.writeByte(0); + } + out.close(); + } + + /** + * When mark() is used on BufferedInputStream, the request + * size on the checksum file system can be small. However, + * checksum file system currently depends on the request size + * >= bytesPerSum to work properly. + */ + public void testTruncatedInputBug() throws IOException { + final int ioBufSize = 512; + final int fileSize = ioBufSize*4; + int filePos = 0; + + Configuration conf = new Configuration(); + conf.setInt("io.file.buffer.size", ioBufSize); + FileSystem fileSys = FileSystem.getLocal(conf); + + try { + // First create a test input file. 
+ Path testFile = new Path(TEST_ROOT_DIR, "HADOOP-1489"); + writeFile(fileSys, testFile, fileSize); + assertTrue(fileSys.exists(testFile)); + assertTrue(fileSys.getFileStatus(testFile).getLen() == fileSize); + + // Now read the file for ioBufSize bytes + FSDataInputStream in = fileSys.open(testFile, ioBufSize); + // seek beyond data buffered by open + filePos += ioBufSize * 2 + (ioBufSize - 10); + in.seek(filePos); + + // read 4 more bytes before marking + for (int i = 0; i < 4; ++i) { + if (in.read() == -1) { + break; + } + ++filePos; + } + + // Now set mark() to trigger the bug + // NOTE: in the fixed code, mark() does nothing (not supported) and + // hence won't trigger this bug. + in.mark(1); + System.out.println("MARKED"); + + // Try to read the rest + while (filePos < fileSize) { + if (in.read() == -1) { + break; + } + ++filePos; + } + in.close(); + + System.out.println("Read " + filePos + " bytes." + + " file size=" + fileSize); + assertTrue(filePos == fileSize); + + } finally { + try { + fileSys.close(); + } catch (Exception e) { + // noop + } + } + } // end testTruncatedInputBug +} diff --git a/src/test/org/apache/hadoop/fs/kfs/KFSEmulationImpl.java b/src/test/org/apache/hadoop/fs/kfs/KFSEmulationImpl.java new file mode 100644 index 00000000000..9c7b5bafef4 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/kfs/KFSEmulationImpl.java @@ -0,0 +1,150 @@ +/** + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * @author: Sriram Rao (Kosmix Corp.) + * + * We need to provide the ability to the code in fs/kfs without really + * having a KFS deployment. For this purpose, use the LocalFileSystem + * as a way to "emulate" KFS. 
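+ * The tests wire the emulation in roughly like this (see
+ * TestKosmosFileSystem.setUp below):
+ *
+ *   Configuration conf = new Configuration();
+ *   KFSEmulationImpl kfsEmul = new KFSEmulationImpl(conf);
+ *   KosmosFileSystem kfs = new KosmosFileSystem(kfsEmul);
+ *   kfs.initialize(URI.create("kfs:///"), conf);  // dummy URI, no KFS cluster needed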
+ */ + +package org.apache.hadoop.fs.kfs; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.Progressable; + +public class KFSEmulationImpl implements IFSImpl { + FileSystem localFS; + + public KFSEmulationImpl(Configuration conf) throws IOException { + localFS = FileSystem.getLocal(conf); + } + + public boolean exists(String path) throws IOException { + return localFS.exists(new Path(path)); + } + public boolean isDirectory(String path) throws IOException { + return localFS.isDirectory(new Path(path)); + } + public boolean isFile(String path) throws IOException { + return localFS.isFile(new Path(path)); + } + + public String[] readdir(String path) throws IOException { + FileStatus[] p = localFS.listStatus(new Path(path)); + String[] entries = null; + + if (p == null) { + return null; + } + + entries = new String[p.length]; + for (int i = 0; i < p.length; i++) + entries[i] = p[i].getPath().toString(); + return entries; + } + + public FileStatus[] readdirplus(Path path) throws IOException { + return localFS.listStatus(path); + } + + public int mkdirs(String path) throws IOException { + if (localFS.mkdirs(new Path(path))) + return 0; + + return -1; + } + + public int rename(String source, String dest) throws IOException { + if (localFS.rename(new Path(source), new Path(dest))) + return 0; + return -1; + } + + public int rmdir(String path) throws IOException { + if (isDirectory(path)) { + // the directory better be empty + String[] dirEntries = readdir(path); + if ((dirEntries.length <= 2) && (localFS.delete(new Path(path), true))) + return 0; + } + return -1; + } + + public int remove(String path) throws IOException { + if (isFile(path) && (localFS.delete(new Path(path), true))) + return 0; + return -1; + } + + public long filesize(String path) throws IOException { + return localFS.getFileStatus(new Path(path)).getLen(); + } + public short getReplication(String path) throws IOException { + return 1; + } + public short setReplication(String path, short replication) throws IOException { + return 1; + } + public String[][] getDataLocation(String path, long start, long len) throws IOException { + BlockLocation[] blkLocations = + localFS.getFileBlockLocations(localFS.getFileStatus(new Path(path)), + start, len); + if ((blkLocations == null) || (blkLocations.length == 0)) { + return new String[0][]; + } + int blkCount = blkLocations.length; + String[][]hints = new String[blkCount][]; + for (int i=0; i < blkCount ; i++) { + String[] hosts = blkLocations[i].getHosts(); + hints[i] = new String[hosts.length]; + hints[i] = hosts; + } + return hints; + } + + public long getModificationTime(String path) throws IOException { + FileStatus s = localFS.getFileStatus(new Path(path)); + if (s == null) + return 0; + + return s.getModificationTime(); + } + + public FSDataOutputStream append(String path, int bufferSize, Progressable progress) throws IOException { + // besides path/overwrite, the other args don't matter for + // testing purposes. + return localFS.append(new Path(path)); + } + + public FSDataOutputStream create(String path, short replication, int bufferSize, Progressable progress) throws IOException { + // besides path/overwrite, the other args don't matter for + // testing purposes. 
+ return localFS.create(new Path(path)); + } + + public FSDataInputStream open(String path, int bufferSize) throws IOException { + return localFS.open(new Path(path)); + } + + +}; diff --git a/src/test/org/apache/hadoop/fs/kfs/TestKosmosFileSystem.java b/src/test/org/apache/hadoop/fs/kfs/TestKosmosFileSystem.java new file mode 100644 index 00000000000..c853f2af3f3 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/kfs/TestKosmosFileSystem.java @@ -0,0 +1,204 @@ +/** + * + * Licensed under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * @author: Sriram Rao (Kosmix Corp.) + * + * Unit tests for testing the KosmosFileSystem API implementation. + */ + +package org.apache.hadoop.fs.kfs; + +import java.io.*; +import java.net.*; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.Path; + +import org.apache.hadoop.fs.kfs.KosmosFileSystem; + +public class TestKosmosFileSystem extends TestCase { + + KosmosFileSystem kosmosFileSystem; + KFSEmulationImpl kfsEmul; + Path baseDir; + + @Override + protected void setUp() throws IOException { + Configuration conf = new Configuration(); + + kfsEmul = new KFSEmulationImpl(conf); + kosmosFileSystem = new KosmosFileSystem(kfsEmul); + // a dummy URI; we are not connecting to any setup here + kosmosFileSystem.initialize(URI.create("kfs:///"), conf); + baseDir = new Path(System.getProperty("test.build.data", "/tmp" ) + + "/kfs-test"); + } + + @Override + protected void tearDown() throws Exception { + + } + + // @Test + // Check all the directory API's in KFS + public void testDirs() throws Exception { + Path subDir1 = new Path("dir.1"); + + // make the dir + kosmosFileSystem.mkdirs(baseDir); + assertTrue(kosmosFileSystem.isDirectory(baseDir)); + kosmosFileSystem.setWorkingDirectory(baseDir); + + kosmosFileSystem.mkdirs(subDir1); + assertTrue(kosmosFileSystem.isDirectory(subDir1)); + + assertFalse(kosmosFileSystem.exists(new Path("test1"))); + assertFalse(kosmosFileSystem.isDirectory(new Path("test/dir.2"))); + + FileStatus[] p = kosmosFileSystem.listStatus(baseDir); + assertEquals(p.length, 1); + + kosmosFileSystem.delete(baseDir, true); + assertFalse(kosmosFileSystem.exists(baseDir)); + } + + // @Test + // Check the file API's + public void testFiles() throws Exception { + Path subDir1 = new Path("dir.1"); + Path file1 = new Path("dir.1/foo.1"); + Path file2 = new Path("dir.1/foo.2"); + + kosmosFileSystem.mkdirs(baseDir); + assertTrue(kosmosFileSystem.isDirectory(baseDir)); + kosmosFileSystem.setWorkingDirectory(baseDir); + + kosmosFileSystem.mkdirs(subDir1); + + FSDataOutputStream s1 = kosmosFileSystem.create(file1, true, 4096, (short) 1, (long) 4096, null); + FSDataOutputStream s2 = kosmosFileSystem.create(file2, true, 4096, (short) 1, (long) 4096, null); + + s1.close(); + s2.close(); + + 
FileStatus[] p = kosmosFileSystem.listStatus(subDir1); + assertEquals(p.length, 2); + + kosmosFileSystem.delete(file1, true); + p = kosmosFileSystem.listStatus(subDir1); + assertEquals(p.length, 1); + + kosmosFileSystem.delete(file2, true); + p = kosmosFileSystem.listStatus(subDir1); + assertEquals(p.length, 0); + + kosmosFileSystem.delete(baseDir, true); + assertFalse(kosmosFileSystem.exists(baseDir)); + } + + // @Test + // Check file/read write + public void testFileIO() throws Exception { + Path subDir1 = new Path("dir.1"); + Path file1 = new Path("dir.1/foo.1"); + + kosmosFileSystem.mkdirs(baseDir); + assertTrue(kosmosFileSystem.isDirectory(baseDir)); + kosmosFileSystem.setWorkingDirectory(baseDir); + + kosmosFileSystem.mkdirs(subDir1); + + FSDataOutputStream s1 = kosmosFileSystem.create(file1, true, 4096, (short) 1, (long) 4096, null); + + int bufsz = 4096; + byte[] data = new byte[bufsz]; + + for (int i = 0; i < data.length; i++) + data[i] = (byte) (i % 16); + + // write 4 bytes and read them back; read API should return a byte per call + s1.write(32); + s1.write(32); + s1.write(32); + s1.write(32); + // write some data + s1.write(data, 0, data.length); + // flush out the changes + s1.close(); + + // Read the stuff back and verify it is correct + FSDataInputStream s2 = kosmosFileSystem.open(file1, 4096); + int v; + long nread = 0; + + v = s2.read(); + assertEquals(v, 32); + v = s2.read(); + assertEquals(v, 32); + v = s2.read(); + assertEquals(v, 32); + v = s2.read(); + assertEquals(v, 32); + + assertEquals(s2.available(), data.length); + + byte[] buf = new byte[bufsz]; + s2.read(buf, 0, buf.length); + nread = s2.getPos(); + + for (int i = 0; i < data.length; i++) + assertEquals(data[i], buf[i]); + + assertEquals(s2.available(), 0); + + s2.close(); + + // append some data to the file + try { + s1 = kosmosFileSystem.append(file1); + for (int i = 0; i < data.length; i++) + data[i] = (byte) (i % 17); + // write the data + s1.write(data, 0, data.length); + // flush out the changes + s1.close(); + + // read it back and validate + s2 = kosmosFileSystem.open(file1, 4096); + s2.seek(nread); + s2.read(buf, 0, buf.length); + for (int i = 0; i < data.length; i++) + assertEquals(data[i], buf[i]); + + s2.close(); + } catch (Exception e) { + System.out.println("append isn't supported by the underlying fs"); + } + + kosmosFileSystem.delete(file1, true); + assertFalse(kosmosFileSystem.exists(file1)); + kosmosFileSystem.delete(subDir1, true); + assertFalse(kosmosFileSystem.exists(subDir1)); + kosmosFileSystem.delete(baseDir, true); + assertFalse(kosmosFileSystem.exists(baseDir)); + } + +} diff --git a/src/test/org/apache/hadoop/fs/loadGenerator/DataGenerator.java b/src/test/org/apache/hadoop/fs/loadGenerator/DataGenerator.java new file mode 100644 index 00000000000..4825bbada50 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/loadGenerator/DataGenerator.java @@ -0,0 +1,160 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.loadGenerator; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +/** + * This program reads the directory structure and file structure from + * the input directory and creates the namespace in the file system + * specified by the configuration in the specified root. + * All the files are filled with 'a'. + * + * The synopsis of the command is + * java DataGenerator + * -inDir : input directory name where directory/file structures + * are stored. Its default value is the current directory. + * -root : the name of the root directory which the new namespace + * is going to be placed under. + * Its default value is "/testLoadSpace". + */ +public class DataGenerator extends Configured implements Tool { + private File inDir = StructureGenerator.DEFAULT_STRUCTURE_DIRECTORY; + private Path root = DEFAULT_ROOT; + private FileSystem fs; + final static private long BLOCK_SIZE = 10; + final static private String USAGE = "java DataGenerator " + + "-inDir " + + "-root "; + + /** default name of the root where the test namespace will be placed under */ + final static Path DEFAULT_ROOT = new Path("/testLoadSpace"); + + /** Main function. + * It first parses the command line arguments. + * It then reads the directory structure from the input directory + * structure file and creates directory structure in the file system + * namespace. Afterwards it reads the file attributes and creates files + * in the file. All file content is filled with 'a'. + */ + public int run(String[] args) throws Exception { + int exitCode = 0; + exitCode = init(args); + if (exitCode != 0) { + return exitCode; + } + genDirStructure(); + genFiles(); + return exitCode; + } + + /** Parse the command line arguments and initialize the data */ + private int init(String[] args) { + try { // initialize file system handle + fs = FileSystem.get(getConf()); + } catch (IOException ioe) { + System.err.println("Can not initialize the file system: " + + ioe.getLocalizedMessage()); + return -1; + } + + for (int i = 0; i < args.length; i++) { // parse command line + if (args[i].equals("-root")) { + root = new Path(args[++i]); + } else if (args[i].equals("-inDir")) { + inDir = new File(args[++i]); + } else { + System.err.println(USAGE); + ToolRunner.printGenericCommandUsage(System.err); + System.exit(-1); + } + } + return 0; + } + + /** Read directory structure file under the input directory. + * Create each directory under the specified root. + * The directory names are relative to the specified root. 
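+   * For example (illustrative), a structure-file line "/dir1/dir2"
+   * results in a call to fs.mkdirs(new Path(root + "/dir1/dir2")).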
+ */ + private void genDirStructure() throws IOException { + BufferedReader in = new BufferedReader( + new FileReader(new File(inDir, + StructureGenerator.DIR_STRUCTURE_FILE_NAME))); + String line; + while ((line=in.readLine()) != null) { + fs.mkdirs(new Path(root+line)); + } + } + + /** Read file structure file under the input directory. + * Create each file under the specified root. + * The file names are relative to the root. + */ + private void genFiles() throws IOException { + BufferedReader in = new BufferedReader( + new FileReader(new File(inDir, + StructureGenerator.FILE_STRUCTURE_FILE_NAME))); + String line; + while ((line=in.readLine()) != null) { + String[] tokens = line.split(" "); + if (tokens.length != 2) { + throw new IOException("Expect at most 2 tokens per line: " + line); + } + String fileName = root+tokens[0]; + long fileSize = (long)(BLOCK_SIZE*Double.parseDouble(tokens[1])); + genFile(new Path(fileName), fileSize); + } + } + + /** Create a file with the name file and + * a length of fileSize. The file is filled with character 'a'. + */ + private void genFile(Path file, long fileSize) throws IOException { + FSDataOutputStream out = fs.create(file, true, + getConf().getInt("io.file.buffer.size", 4096), + (short)getConf().getInt("dfs.replication", 3), + fs.getDefaultBlockSize()); + for(long i=0; i: read probability [0, 1] + * with a default value of 0.3333. + * -writeProbability : write probability [0, 1] + * with a default value of 0.3333. + * -root : test space with a default value of /testLoadSpace + * -maxDelayBetweenOps : + * Max delay in the unit of milliseconds between two operations with a + * default value of 0 indicating no delay. + * -numOfThreads : + * number of threads to spawn with a default value of 200. + * -elapsedTime : + * the elapsed time of program with a default value of 0 + * indicating running forever + * -startTime : when the threads start to run. 
+ * -scriptFile : text file to parse for scripted operation + */ +public class LoadGenerator extends Configured implements Tool { + public static final Log LOG = LogFactory.getLog(LoadGenerator.class); + + private volatile boolean shouldRun = true; + private Path root = DataGenerator.DEFAULT_ROOT; + private FileSystem fs; + private int maxDelayBetweenOps = 0; + private int numOfThreads = 200; + private long [] durations = {0}; + private double [] readProbs = {0.3333}; + private double [] writeProbs = {0.3333}; + private volatile int currentIndex = 0; + long totalTime = 0; + private long startTime = System.currentTimeMillis()+10000; + final static private int BLOCK_SIZE = 10; + private ArrayList files = new ArrayList(); // a table of file names + private ArrayList dirs = new ArrayList(); // a table of directory names + private Random r = null; + final private static String USAGE = "java LoadGenerator\n" + + "-readProbability \n" + + "-writeProbability \n" + + "-root \n" + + "-maxDelayBetweenOps \n" + + "-numOfThreads \n" + + "-elapsedTime \n" + + "-startTime \n" + + "-scriptFile "; + final private String hostname; + + /** Constructor */ + public LoadGenerator() throws IOException, UnknownHostException { + InetAddress addr = InetAddress.getLocalHost(); + hostname = addr.getHostName(); + } + + private final static int OPEN = 0; + private final static int LIST = 1; + private final static int CREATE = 2; + private final static int WRITE_CLOSE = 3; + private final static int DELETE = 4; + private final static int TOTAL_OP_TYPES =5; + private long [] executionTime = new long[TOTAL_OP_TYPES]; + private long [] totalNumOfOps = new long[TOTAL_OP_TYPES]; + + /** A thread sends a stream of requests to the NameNode. + * At each iteration, it first decides if it is going to read a file, + * create a file, or listing a directory following the read + * and write probabilities. + * When reading, it randomly picks a file in the test space and reads + * the entire file. When writing, it randomly picks a directory in the + * test space and creates a file whose name consists of the current + * machine's host name and the thread id. The length of the file + * follows Gaussian distribution with an average size of 2 blocks and + * the standard deviation of 1 block. The new file is filled with 'a'. + * Immediately after the file creation completes, the file is deleted + * from the test space. + * While listing, it randomly picks a directory in the test space and + * list the directory content. + * Between two consecutive operations, the thread pauses for a random + * amount of time in the range of [0, maxDelayBetweenOps] + * if the specified max delay is not zero. + * A thread runs for the specified elapsed time if the time isn't zero. + * Otherwise, it runs forever. + */ + private class DFSClientThread extends Thread { + private int id; + private long [] executionTime = new long[TOTAL_OP_TYPES]; + private long [] totalNumOfOps = new long[TOTAL_OP_TYPES]; + private byte[] buffer = new byte[1024]; + + private DFSClientThread(int id) { + this.id = id; + } + + /** Main loop + * Each iteration decides what's the next operation and then pauses. + */ + public void run() { + try { + while (shouldRun) { + nextOp(); + delay(); + } + } catch (Exception ioe) { + System.err.println(ioe.getLocalizedMessage()); + ioe.printStackTrace(); + } + } + + /** Let the thread pause for a random amount of time in the range of + * [0, maxDelayBetweenOps] if the delay is not zero. Otherwise, no pause. 
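+     * For example, with -maxDelayBetweenOps 1000 each pause is a uniformly
+     * random number of milliseconds drawn from [0, 1000).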
+ */ + private void delay() throws InterruptedException { + if (maxDelayBetweenOps>0) { + int delay = r.nextInt(maxDelayBetweenOps); + Thread.sleep(delay); + } + } + + /** Perform the next operation. + * + * Depending on the read and write probabilities, the next + * operation could be either read, write, or list. + */ + private void nextOp() throws IOException { + double rn = r.nextDouble(); + int i = currentIndex; + + if(LOG.isDebugEnabled()) + LOG.debug("Thread " + this.id + " moving to index " + i); + + if (rn < readProbs[i]) { + read(); + } else if (rn < readProbs[i] + writeProbs[i]) { + write(); + } else { + list(); + } + } + + /** Read operation randomly picks a file in the test space and reads + * the entire file */ + private void read() throws IOException { + String fileName = files.get(r.nextInt(files.size())); + long startTime = System.currentTimeMillis(); + InputStream in = fs.open(new Path(fileName)); + executionTime[OPEN] += (System.currentTimeMillis()-startTime); + totalNumOfOps[OPEN]++; + while (in.read(buffer) != -1) {} + in.close(); + } + + /** The write operation randomly picks a directory in the + * test space and creates a file whose name consists of the current + * machine's host name and the thread id. The length of the file + * follows Gaussian distribution with an average size of 2 blocks and + * the standard deviation of 1 block. The new file is filled with 'a'. + * Immediately after the file creation completes, the file is deleted + * from the test space. + */ + private void write() throws IOException { + String dirName = dirs.get(r.nextInt(dirs.size())); + Path file = new Path(dirName, hostname+id); + double fileSize = 0; + while ((fileSize = r.nextGaussian()+2)<=0) {} + genFile(file, (long)(fileSize*BLOCK_SIZE)); + long startTime = System.currentTimeMillis(); + fs.delete(file, true); + executionTime[DELETE] += (System.currentTimeMillis()-startTime); + totalNumOfOps[DELETE]++; + } + + /** The list operation randomly picks a directory in the test space and + * list the directory content. + */ + private void list() throws IOException { + String dirName = dirs.get(r.nextInt(dirs.size())); + long startTime = System.currentTimeMillis(); + fs.listStatus(new Path(dirName)); + executionTime[LIST] += (System.currentTimeMillis()-startTime); + totalNumOfOps[LIST]++; + } + } + + /** Main function: + * It first initializes data by parsing the command line arguments. + * It then starts the number of DFSClient threads as specified by + * the user. + * It stops all the threads when the specified elapsed time is passed. + * Before exiting, it prints the average execution for + * each operation and operation throughput. + */ + public int run(String[] args) throws Exception { + int exitCode = init(args); + if (exitCode != 0) { + return exitCode; + } + + barrier(); + + DFSClientThread[] threads = new DFSClientThread[numOfThreads]; + for (int i=0; i 0) { + while(shouldRun) { + Thread.sleep(durations[currentIndex] * 1000); + totalTime += durations[currentIndex]; + + // Are we on the final line of the script? + if( (currentIndex + 1) == durations.length) { + shouldRun = false; + } else { + if(LOG.isDebugEnabled()) { + LOG.debug("Moving to index " + currentIndex + ": r = " + + readProbs[currentIndex] + ", w = " + writeProbs + + " for duration " + durations[currentIndex]); + } + currentIndex++; + } + } + } + + LOG.debug("Done with testing. 
Waiting for threads to finish."); + for (DFSClientThread thread : threads) { + thread.join(); + for (int i=0; i 1) { + System.err.println( + "The read probability must be [0, 1]: " + readProbs[0]); + return -1; + } + } else if (args[i].equals("-writeProbability")) { + if(scriptSpecified) { + System.err.println("Can't specify probabilities and use script."); + return -1; + } + writeProbs[0] = Double.parseDouble(args[++i]); + if (writeProbs[0] < 0 || writeProbs[0] > 1) { + System.err.println( + "The write probability must be [0, 1]: " + writeProbs[0]); + return -1; + } + } else if (args[i].equals("-root")) { + root = new Path(args[++i]); + } else if (args[i].equals("-maxDelayBetweenOps")) { + maxDelayBetweenOps = Integer.parseInt(args[++i]); // in milliseconds + } else if (args[i].equals("-numOfThreads")) { + numOfThreads = Integer.parseInt(args[++i]); + if (numOfThreads <= 0) { + System.err.println( + "Number of threads must be positive: " + numOfThreads); + return -1; + } + } else if (args[i].equals("-startTime")) { + startTime = Long.parseLong(args[++i]); + } else if (args[i].equals("-elapsedTime")) { + if(scriptSpecified) { + System.err.println("Can't specify elapsedTime and use script."); + return -1; + } + durations[0] = Long.parseLong(args[++i]); + } else if (args[i].equals("-seed")) { + r = new Random(Long.parseLong(args[++i])+hostHashCode); + } else { + System.err.println(USAGE); + ToolRunner.printGenericCommandUsage(System.err); + return -1; + } + } + } catch (NumberFormatException e) { + System.err.println("Illegal parameter: " + e.getLocalizedMessage()); + System.err.println(USAGE); + return -1; + } + + for(int i = 0; i < readProbs.length; i++) { + if (readProbs[i] + writeProbs[i] <0 || readProbs[i]+ writeProbs[i] > 1) { + System.err.println( + "The sum of read probability and write probability must be [0, 1]: " + + readProbs[i] + " " + writeProbs[i]); + return -1; + } + } + + if (r==null) { + r = new Random(System.currentTimeMillis()+hostHashCode); + } + + return initFileDirTables(); + } + + /** + * Read a script file of the form: lines of text with duration in seconds, + * read probability and write probability, separated by white space. 
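+   * An illustrative script (lines starting with '#' and blank lines are
+   * skipped; the columns are duration in seconds, read probability and
+   * write probability):
+   *
+   *   # duration readProb writeProb
+   *   60  0.5 0.3
+   *   120 0.2 0.6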
+ * + * @param filename Script file + * @return 0 if successful, -1 if not + * @throws IOException if errors with file IO + */ + private int loadScriptFile(String filename) throws IOException { + FileReader fr = new FileReader(new File(filename)); + BufferedReader br = new BufferedReader(fr); + ArrayList duration = new ArrayList(); + ArrayList readProb = new ArrayList(); + ArrayList writeProb = new ArrayList(); + int lineNum = 0; + + String line; + // Read script, parse values, build array of duration, read and write probs + while((line = br.readLine()) != null) { + lineNum++; + if(line.startsWith("#") || line.isEmpty()) // skip comments and blanks + continue; + + String[] a = line.split("\\s"); + if(a.length != 3) { + System.err.println("Line " + lineNum + + ": Incorrect number of parameters: " + line); + } + + try { + long d = Long.parseLong(a[0]); + if(d < 0) { + System.err.println("Line " + lineNum + ": Invalid duration: " + d); + return -1; + } + + double r = Double.parseDouble(a[1]); + if(r < 0.0 || r > 1.0 ) { + System.err.println("Line " + lineNum + + ": The read probability must be [0, 1]: " + r); + return -1; + } + + double w = Double.parseDouble(a[2]); + if(w < 0.0 || w > 1.0) { + System.err.println("Line " + lineNum + + ": The read probability must be [0, 1]: " + r); + return -1; + } + + readProb.add(r); + duration.add(d); + writeProb.add(w); + } catch( NumberFormatException nfe) { + System.err.println(lineNum + ": Can't parse: " + line); + return -1; + } + } + + br.close(); + fr.close(); + + // Copy vectors to arrays of values, to avoid autoboxing overhead later + durations = new long[duration.size()]; + readProbs = new double[readProb.size()]; + writeProbs = new double[writeProb.size()]; + + for(int i = 0; i < durations.length; i++) { + durations[i] = duration.get(i); + readProbs[i] = readProb.get(i); + writeProbs[i] = writeProb.get(i); + } + + if(durations[0] == 0) + System.err.println("Initial duration set to 0. " + + "Will loop until stopped manually."); + + return 0; + } + + /** Create a table that contains all directories under root and + * another table that contains all files under root. + */ + private int initFileDirTables() { + try { + initFileDirTables(root); + } catch (IOException e) { + System.err.println(e.getLocalizedMessage()); + e.printStackTrace(); + return -1; + } + if (dirs.isEmpty()) { + System.err.println("The test space " + root + " is empty"); + return -1; + } + if (files.isEmpty()) { + System.err.println("The test space " + root + + " does not have any file"); + return -1; + } + return 0; + } + + /** Create a table that contains all directories under the specified path and + * another table that contains all files under the specified path and + * whose name starts with "_file_". + */ + private void initFileDirTables(Path path) throws IOException { + FileStatus[] stats = fs.listStatus(path); + if (stats != null) { + for (FileStatus stat : stats) { + if (stat.isDir()) { + dirs.add(stat.getPath().toString()); + initFileDirTables(stat.getPath()); + } else { + Path filePath = stat.getPath(); + if (filePath.getName().startsWith(StructureGenerator.FILE_NAME_PREFIX)) { + files.add(filePath.toString()); + } + } + } + } + } + + /** Returns when the current number of seconds from the epoch equals + * the command line argument given by -startTime. + * This allows multiple instances of this program, running on clock + * synchronized nodes, to start at roughly the same time. 
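+   * (The comparison is made against System.currentTimeMillis(), so the
+   * -startTime value is expressed in milliseconds since the epoch.)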
+ */ + private void barrier() { + long sleepTime; + while ((sleepTime = startTime - System.currentTimeMillis()) > 0) { + try { + Thread.sleep(sleepTime); + } catch (InterruptedException ex) { + } + } + } + + /** Create a file with a length of fileSize. + * The file is filled with 'a'. + */ + private void genFile(Path file, long fileSize) throws IOException { + long startTime = System.currentTimeMillis(); + FSDataOutputStream out = fs.create(file, true, + getConf().getInt("io.file.buffer.size", 4096), + (short)getConf().getInt("dfs.replication", 3), + fs.getDefaultBlockSize()); + executionTime[CREATE] += (System.currentTimeMillis()-startTime); + totalNumOfOps[CREATE]++; + + for (long i=0; i : maximum depth of the directory tree; default is 5. + -minWidth : minimum number of subdirectories per directories; default is 1 + -maxWidth : maximum number of subdirectories per directories; default is 5 + -numOfFiles <#OfFiles> : the total number of files; default is 10. + -avgFileSize : average size of blocks; default is 1. + -outDir : output directory; default is the current directory. + -seed : random number generator seed; default is the current time. + */ +public class StructureGenerator { + private int maxDepth = 5; + private int minWidth = 1; + private int maxWidth = 5; + private int numOfFiles = 10; + private double avgFileSize = 1; + private File outDir = DEFAULT_STRUCTURE_DIRECTORY; + final static private String USAGE = "java StructureGenerator\n" + + "-maxDepth \n" + + "-minWidth \n" + + "-maxWidth \n" + + "-numOfFiles <#OfFiles>\n" + + "-avgFileSize \n" + + "-outDir \n" + + "-seed "; + + private Random r = null; + + /** Default directory for storing file/directory structure */ + final static File DEFAULT_STRUCTURE_DIRECTORY = new File("."); + /** The name of the file for storing directory structure */ + final static String DIR_STRUCTURE_FILE_NAME = "dirStructure"; + /** The name of the file for storing file structure */ + final static String FILE_STRUCTURE_FILE_NAME = "fileStructure"; + /** The name prefix for the files created by this program */ + final static String FILE_NAME_PREFIX = "_file_"; + + /** + * The main function first parses the command line arguments, + * then generates in-memory directory structure and outputs to a file, + * last generates in-memory files and outputs them to a file. 
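+  * An illustrative invocation (every argument is optional; the values
+  * shown are the documented defaults):
+  *
+  *   java StructureGenerator -maxDepth 5 -minWidth 1 -maxWidth 5 \
+  *       -numOfFiles 10 -avgFileSize 1 -outDir .
+  *
+  * This writes the "dirStructure" and "fileStructure" description files
+  * into the output directory; DataGenerator later reads them to populate
+  * the test namespace.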
+ */ + public int run(String[] args) throws Exception { + int exitCode = 0; + exitCode = init(args); + if (exitCode != 0) { + return exitCode; + } + genDirStructure(); + output(new File(outDir, DIR_STRUCTURE_FILE_NAME)); + genFileStructure(); + outputFiles(new File(outDir, FILE_STRUCTURE_FILE_NAME)); + return exitCode; + } + + /** Parse the command line arguments and initialize the data */ + private int init(String[] args) { + try { + for (int i = 0; i < args.length; i++) { // parse command line + if (args[i].equals("-maxDepth")) { + maxDepth = Integer.parseInt(args[++i]); + if (maxDepth<1) { + System.err.println("maxDepth must be positive: " + maxDepth); + return -1; + } + } else if (args[i].equals("-minWidth")) { + minWidth = Integer.parseInt(args[++i]); + if (minWidth<0) { + System.err.println("minWidth must be positive: " + minWidth); + return -1; + } + } else if (args[i].equals("-maxWidth")) { + maxWidth = Integer.parseInt(args[++i]); + } else if (args[i].equals("-numOfFiles")) { + numOfFiles = Integer.parseInt(args[++i]); + if (numOfFiles<1) { + System.err.println("NumOfFiles must be positive: " + numOfFiles); + return -1; + } + } else if (args[i].equals("-avgFileSize")) { + avgFileSize = Double.parseDouble(args[++i]); + if (avgFileSize<=0) { + System.err.println("AvgFileSize must be positive: " + avgFileSize); + return -1; + } + } else if (args[i].equals("-outDir")) { + outDir = new File(args[++i]); + } else if (args[i].equals("-seed")) { + r = new Random(Long.parseLong(args[++i])); + } else { + System.err.println(USAGE); + ToolRunner.printGenericCommandUsage(System.err); + return -1; + } + } + } catch (NumberFormatException e) { + System.err.println("Illegal parameter: " + e.getLocalizedMessage()); + System.err.println(USAGE); + return -1; + } + + if (maxWidth < minWidth) { + System.err.println( + "maxWidth must be bigger than minWidth: " + maxWidth); + return -1; + } + + if (r==null) { + r = new Random(); + } + return 0; + } + + /** In memory representation of a directory */ + private static class INode { + private String name; + private List children = new ArrayList(); + + /** Constructor */ + private INode(String name) { + this.name = name; + } + + /** Add a child (subdir/file) */ + private void addChild(INode child) { + children.add(child); + } + + /** Output the subtree rooted at the current node. + * Only the leaves are printed. 
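+    * For example (illustrative), a tree whose root contains dir1/dir2 and
+    * dir3 is printed as the two leaf paths "/dir1/dir2" and "/dir3".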
+ */ + private void output(PrintStream out, String prefix) { + prefix = prefix==null?name:prefix+"/"+name; + if (children.isEmpty()) { + out.println(prefix); + } else { + for (INode child : children) { + child.output(out, prefix); + } + } + } + + /** Output the files in the subtree rooted at this node */ + protected void outputFiles(PrintStream out, String prefix) { + prefix = prefix==null?name:prefix+"/"+name; + for (INode child : children) { + child.outputFiles(out, prefix); + } + } + + /** Add all the leaves in the subtree to the input list */ + private void getLeaves(List leaves) { + if (children.isEmpty()) { + leaves.add(this); + } else { + for (INode child : children) { + child.getLeaves(leaves); + } + } + } + } + + /** In memory representation of a file */ + private static class FileINode extends INode { + private double numOfBlocks; + + /** constructor */ + private FileINode(String name, double numOfBlocks) { + super(name); + this.numOfBlocks = numOfBlocks; + } + + /** Output a file attribute */ + protected void outputFiles(PrintStream out, String prefix) { + prefix = (prefix == null)?super.name: prefix + "/"+super.name; + out.println(prefix + " " + numOfBlocks); + } + } + + private INode root; + + /** Generates a directory tree with a max depth of maxDepth */ + private void genDirStructure() { + root = genDirStructure("", maxDepth); + } + + /** Generate a directory tree rooted at rootName + * The number of subtree is in the range of [minWidth, maxWidth]. + * The maximum depth of each subtree is in the range of + * [2*maxDepth/3, maxDepth]. + */ + private INode genDirStructure(String rootName, int maxDepth) { + INode root = new INode(rootName); + + if (maxDepth>0) { + maxDepth--; + int minDepth = maxDepth*2/3; + // Figure out the number of subdirectories to generate + int numOfSubDirs = minWidth + r.nextInt(maxWidth-minWidth+1); + // Expand the tree + for (int i=0; i getLeaves() { + List leaveDirs = new ArrayList(); + root.getLeaves(leaveDirs); + return leaveDirs; + } + + /** Decides where to place all the files and its length. + * It first collects all empty directories in the tree. + * For each file, it randomly chooses an empty directory to place the file. + * The file's length is generated using Gaussian distribution. 
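+    * Each placed file is named with the "_file_" prefix and is recorded in
+    * the file-structure output as "path numOfBlocks"; DataGenerator later
+    * multiplies that block count by its block size to obtain the physical
+    * file length.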
+ */ + private void genFileStructure() { + List leaves = getLeaves(); + int totalLeaves = leaves.size(); + for (int i=0; i inodes = new TreeMap(); + private Map blocks = new HashMap(); + + public void initialize(URI uri, Configuration conf) { + this.conf = conf; + } + + public String getVersion() throws IOException { + return "0"; + } + + public void deleteINode(Path path) throws IOException { + inodes.remove(normalize(path)); + } + + public void deleteBlock(Block block) throws IOException { + blocks.remove(block.getId()); + } + + public boolean inodeExists(Path path) throws IOException { + return inodes.containsKey(normalize(path)); + } + + public boolean blockExists(long blockId) throws IOException { + return blocks.containsKey(blockId); + } + + public INode retrieveINode(Path path) throws IOException { + return inodes.get(normalize(path)); + } + + public File retrieveBlock(Block block, long byteRangeStart) throws IOException { + byte[] data = blocks.get(block.getId()); + File file = createTempFile(); + BufferedOutputStream out = null; + try { + out = new BufferedOutputStream(new FileOutputStream(file)); + out.write(data, (int) byteRangeStart, data.length - (int) byteRangeStart); + } finally { + if (out != null) { + out.close(); + } + } + return file; + } + + private File createTempFile() throws IOException { + File dir = new File(conf.get("fs.s3.buffer.dir")); + if (!dir.exists() && !dir.mkdirs()) { + throw new IOException("Cannot create S3 buffer directory: " + dir); + } + File result = File.createTempFile("test-", ".tmp", dir); + result.deleteOnExit(); + return result; + } + + public Set listSubPaths(Path path) throws IOException { + Path normalizedPath = normalize(path); + // This is inefficient but more than adequate for testing purposes. + Set subPaths = new LinkedHashSet(); + for (Path p : inodes.tailMap(normalizedPath).keySet()) { + if (normalizedPath.equals(p.getParent())) { + subPaths.add(p); + } + } + return subPaths; + } + + public Set listDeepSubPaths(Path path) throws IOException { + Path normalizedPath = normalize(path); + String pathString = normalizedPath.toUri().getPath(); + if (!pathString.endsWith("/")) { + pathString += "/"; + } + // This is inefficient but more than adequate for testing purposes. 
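+      // every inode in the tail map whose path string starts with "path/"
+      // is a direct or transitive descendant, so a prefix check is enough
+      // for this in-memory stub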
+ Set subPaths = new LinkedHashSet(); + for (Path p : inodes.tailMap(normalizedPath).keySet()) { + if (p.toUri().getPath().startsWith(pathString)) { + subPaths.add(p); + } + } + return subPaths; + } + + public void storeINode(Path path, INode inode) throws IOException { + inodes.put(normalize(path), inode); + } + + public void storeBlock(Block block, File file) throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + byte[] buf = new byte[8192]; + int numRead; + BufferedInputStream in = null; + try { + in = new BufferedInputStream(new FileInputStream(file)); + while ((numRead = in.read(buf)) >= 0) { + out.write(buf, 0, numRead); + } + } finally { + if (in != null) { + in.close(); + } + } + blocks.put(block.getId(), out.toByteArray()); + } + + private Path normalize(Path path) { + if (!path.isAbsolute()) { + throw new IllegalArgumentException("Path must be absolute: " + path); + } + return new Path(path.toUri().getPath()); + } + + public void purge() throws IOException { + inodes.clear(); + blocks.clear(); + } + + public void dump() throws IOException { + StringBuilder sb = new StringBuilder(getClass().getSimpleName()); + sb.append(", \n"); + for (Map.Entry entry : inodes.entrySet()) { + sb.append(entry.getKey()).append("\n"); + INode inode = entry.getValue(); + sb.append("\t").append(inode.getFileType()).append("\n"); + if (inode.getFileType() == FileType.DIRECTORY) { + continue; + } + for (int j = 0; j < inode.getBlocks().length; j++) { + sb.append("\t").append(inode.getBlocks()[j]).append("\n"); + } + } + System.out.println(sb); + + System.out.println(inodes.keySet()); + System.out.println(blocks.keySet()); + } + +} diff --git a/src/test/org/apache/hadoop/fs/s3/Jets3tS3FileSystemContractTest.java b/src/test/org/apache/hadoop/fs/s3/Jets3tS3FileSystemContractTest.java new file mode 100644 index 00000000000..53b3c03c414 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3/Jets3tS3FileSystemContractTest.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3; + +import java.io.IOException; + +public class Jets3tS3FileSystemContractTest + extends S3FileSystemContractBaseTest { + + @Override + FileSystemStore getFileSystemStore() throws IOException { + return new Jets3tFileSystemStore(); + } + +} diff --git a/src/test/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java b/src/test/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java new file mode 100644 index 00000000000..8d6744a12a3 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java @@ -0,0 +1,48 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3; + +import java.io.IOException; +import java.net.URI; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystemContractBaseTest; + +public abstract class S3FileSystemContractBaseTest + extends FileSystemContractBaseTest { + + private FileSystemStore store; + + abstract FileSystemStore getFileSystemStore() throws IOException; + + @Override + protected void setUp() throws Exception { + Configuration conf = new Configuration(); + store = getFileSystemStore(); + fs = new S3FileSystem(store); + fs.initialize(URI.create(conf.get("test.fs.s3.name")), conf); + } + + @Override + protected void tearDown() throws Exception { + store.purge(); + super.tearDown(); + } + +} diff --git a/src/test/org/apache/hadoop/fs/s3/TestINode.java b/src/test/org/apache/hadoop/fs/s3/TestINode.java new file mode 100644 index 00000000000..086a43eabca --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3/TestINode.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3; + +import java.io.IOException; +import java.io.InputStream; + +import junit.framework.TestCase; + +import org.apache.hadoop.fs.s3.INode.FileType; + +public class TestINode extends TestCase { + + public void testSerializeFileWithSingleBlock() throws IOException { + Block[] blocks = { new Block(849282477840258181L, 128L) }; + INode inode = new INode(FileType.FILE, blocks); + + assertEquals("Length", 1L + 4 + 16, inode.getSerializedLength()); + InputStream in = inode.serialize(); + + INode deserialized = INode.deserialize(in); + + assertEquals("FileType", inode.getFileType(), deserialized.getFileType()); + Block[] deserializedBlocks = deserialized.getBlocks(); + assertEquals("Length", 1, deserializedBlocks.length); + assertEquals("Id", blocks[0].getId(), deserializedBlocks[0].getId()); + assertEquals("Length", blocks[0].getLength(), deserializedBlocks[0] + .getLength()); + + } + + public void testSerializeDirectory() throws IOException { + INode inode = INode.DIRECTORY_INODE; + assertEquals("Length", 1L, inode.getSerializedLength()); + InputStream in = inode.serialize(); + INode deserialized = INode.deserialize(in); + assertSame(INode.DIRECTORY_INODE, deserialized); + } + + public void testDeserializeNull() throws IOException { + assertNull(INode.deserialize(null)); + } + +} diff --git a/src/test/org/apache/hadoop/fs/s3/TestInMemoryS3FileSystemContract.java b/src/test/org/apache/hadoop/fs/s3/TestInMemoryS3FileSystemContract.java new file mode 100644 index 00000000000..5d66cf12c85 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3/TestInMemoryS3FileSystemContract.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3; + +import java.io.IOException; + +public class TestInMemoryS3FileSystemContract + extends S3FileSystemContractBaseTest { + + @Override + FileSystemStore getFileSystemStore() throws IOException { + return new InMemoryFileSystemStore(); + } + +} diff --git a/src/test/org/apache/hadoop/fs/s3/TestS3Credentials.java b/src/test/org/apache/hadoop/fs/s3/TestS3Credentials.java new file mode 100644 index 00000000000..bcbf0dc607a --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3/TestS3Credentials.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs.s3; + +import java.net.URI; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; + +public class TestS3Credentials extends TestCase { + public void testInvalidHostnameWithUnderscores() throws Exception { + S3Credentials s3Credentials = new S3Credentials(); + try { + s3Credentials.initialize(new URI("s3://a:b@c_d"), new Configuration()); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException e) { + assertEquals("Invalid hostname in URI s3://a:b@c_d", e.getMessage()); + } + } +} diff --git a/src/test/org/apache/hadoop/fs/s3/TestS3FileSystem.java b/src/test/org/apache/hadoop/fs/s3/TestS3FileSystem.java new file mode 100644 index 00000000000..f21989c5d97 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3/TestS3FileSystem.java @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3; + +import java.io.IOException; +import java.net.URI; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; + +public class TestS3FileSystem extends TestCase { + + public void testInitialization() throws IOException { + initializationTest("s3://a:b@c", "s3://a:b@c"); + initializationTest("s3://a:b@c/", "s3://a:b@c"); + initializationTest("s3://a:b@c/path", "s3://a:b@c"); + initializationTest("s3://a@c", "s3://a@c"); + initializationTest("s3://a@c/", "s3://a@c"); + initializationTest("s3://a@c/path", "s3://a@c"); + initializationTest("s3://c", "s3://c"); + initializationTest("s3://c/", "s3://c"); + initializationTest("s3://c/path", "s3://c"); + } + + private void initializationTest(String initializationUri, String expectedUri) + throws IOException { + + S3FileSystem fs = new S3FileSystem(new InMemoryFileSystemStore()); + fs.initialize(URI.create(initializationUri), new Configuration()); + assertEquals(URI.create(expectedUri), fs.getUri()); + } + +} diff --git a/src/test/org/apache/hadoop/fs/s3native/InMemoryNativeFileSystemStore.java b/src/test/org/apache/hadoop/fs/s3native/InMemoryNativeFileSystemStore.java new file mode 100644 index 00000000000..d3086da9e82 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3native/InMemoryNativeFileSystemStore.java @@ -0,0 +1,198 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3native; + +import static org.apache.hadoop.fs.s3native.NativeS3FileSystem.PATH_DELIMITER; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; + +/** + *
<p>
+ * A stub implementation of {@link NativeFileSystemStore} for testing + * {@link NativeS3FileSystem} without actually connecting to S3. + *
</p>
+ */ +class InMemoryNativeFileSystemStore implements NativeFileSystemStore { + + private Configuration conf; + + private SortedMap metadataMap = + new TreeMap(); + private SortedMap dataMap = new TreeMap(); + + public void initialize(URI uri, Configuration conf) throws IOException { + this.conf = conf; + } + + public void storeEmptyFile(String key) throws IOException { + metadataMap.put(key, new FileMetadata(key, 0, System.currentTimeMillis())); + dataMap.put(key, new byte[0]); + } + + public void storeFile(String key, File file, byte[] md5Hash) + throws IOException { + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + byte[] buf = new byte[8192]; + int numRead; + BufferedInputStream in = null; + try { + in = new BufferedInputStream(new FileInputStream(file)); + while ((numRead = in.read(buf)) >= 0) { + out.write(buf, 0, numRead); + } + } finally { + if (in != null) { + in.close(); + } + } + metadataMap.put(key, + new FileMetadata(key, file.length(), System.currentTimeMillis())); + dataMap.put(key, out.toByteArray()); + } + + public InputStream retrieve(String key) throws IOException { + return retrieve(key, 0); + } + + public InputStream retrieve(String key, long byteRangeStart) + throws IOException { + + byte[] data = dataMap.get(key); + File file = createTempFile(); + BufferedOutputStream out = null; + try { + out = new BufferedOutputStream(new FileOutputStream(file)); + out.write(data, (int) byteRangeStart, + data.length - (int) byteRangeStart); + } finally { + if (out != null) { + out.close(); + } + } + return new FileInputStream(file); + } + + private File createTempFile() throws IOException { + File dir = new File(conf.get("fs.s3.buffer.dir")); + if (!dir.exists() && !dir.mkdirs()) { + throw new IOException("Cannot create S3 buffer directory: " + dir); + } + File result = File.createTempFile("test-", ".tmp", dir); + result.deleteOnExit(); + return result; + } + + public FileMetadata retrieveMetadata(String key) throws IOException { + return metadataMap.get(key); + } + + public PartialListing list(String prefix, int maxListingLength) + throws IOException { + return list(prefix, maxListingLength, null); + } + + public PartialListing list(String prefix, int maxListingLength, + String priorLastKey) throws IOException { + + return list(prefix, PATH_DELIMITER, maxListingLength, priorLastKey); + } + + public PartialListing listAll(String prefix, int maxListingLength, + String priorLastKey) throws IOException { + + return list(prefix, null, maxListingLength, priorLastKey); + } + + private PartialListing list(String prefix, String delimiter, + int maxListingLength, String priorLastKey) throws IOException { + + if (prefix.length() > 0 && !prefix.endsWith(PATH_DELIMITER)) { + prefix += PATH_DELIMITER; + } + + List metadata = new ArrayList(); + SortedSet commonPrefixes = new TreeSet(); + for (String key : dataMap.keySet()) { + if (key.startsWith(prefix)) { + if (delimiter == null) { + metadata.add(retrieveMetadata(key)); + } else { + int delimIndex = key.indexOf(delimiter, prefix.length()); + if (delimIndex == -1) { + metadata.add(retrieveMetadata(key)); + } else { + String commonPrefix = key.substring(0, delimIndex); + commonPrefixes.add(commonPrefix); + } + } + } + if (metadata.size() + commonPrefixes.size() == maxListingLength) { + new PartialListing(key, metadata.toArray(new FileMetadata[0]), + commonPrefixes.toArray(new String[0])); + } + } + return new PartialListing(null, metadata.toArray(new FileMetadata[0]), + commonPrefixes.toArray(new String[0])); + } + + public void 
delete(String key) throws IOException { + metadataMap.remove(key); + dataMap.remove(key); + } + + public void rename(String srcKey, String dstKey) throws IOException { + metadataMap.put(dstKey, metadataMap.remove(srcKey)); + dataMap.put(dstKey, dataMap.remove(srcKey)); + } + + public void purge(String prefix) throws IOException { + Iterator> i = + metadataMap.entrySet().iterator(); + while (i.hasNext()) { + Entry entry = i.next(); + if (entry.getKey().startsWith(prefix)) { + dataMap.remove(entry.getKey()); + i.remove(); + } + } + } + + public void dump() throws IOException { + System.out.println(metadataMap.values()); + System.out.println(dataMap.keySet()); + } +} diff --git a/src/test/org/apache/hadoop/fs/s3native/Jets3tNativeS3FileSystemContractTest.java b/src/test/org/apache/hadoop/fs/s3native/Jets3tNativeS3FileSystemContractTest.java new file mode 100644 index 00000000000..6516c836f88 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3native/Jets3tNativeS3FileSystemContractTest.java @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3native; + +import java.io.IOException; + +public class Jets3tNativeS3FileSystemContractTest + extends NativeS3FileSystemContractBaseTest { + + @Override + NativeFileSystemStore getNativeFileSystemStore() throws IOException { + return new Jets3tNativeFileSystemStore(); + } +} diff --git a/src/test/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java b/src/test/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java new file mode 100644 index 00000000000..bf2e3c3d387 --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3native; + +import java.io.IOException; +import java.net.URI; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystemContractBaseTest; +import org.apache.hadoop.fs.Path; + +public abstract class NativeS3FileSystemContractBaseTest + extends FileSystemContractBaseTest { + + private NativeFileSystemStore store; + + abstract NativeFileSystemStore getNativeFileSystemStore() throws IOException; + + @Override + protected void setUp() throws Exception { + Configuration conf = new Configuration(); + store = getNativeFileSystemStore(); + fs = new NativeS3FileSystem(store); + fs.initialize(URI.create(conf.get("test.fs.s3n.name")), conf); + } + + @Override + protected void tearDown() throws Exception { + store.purge("test"); + super.tearDown(); + } + + public void testListStatusForRoot() throws Exception { + Path testDir = path("/test"); + assertTrue(fs.mkdirs(testDir)); + + FileStatus[] paths = fs.listStatus(path("/")); + assertEquals(1, paths.length); + assertEquals(path("/test"), paths[0].getPath()); + } + +} diff --git a/src/test/org/apache/hadoop/fs/s3native/TestInMemoryNativeS3FileSystemContract.java b/src/test/org/apache/hadoop/fs/s3native/TestInMemoryNativeS3FileSystemContract.java new file mode 100644 index 00000000000..664d39e6f4f --- /dev/null +++ b/src/test/org/apache/hadoop/fs/s3native/TestInMemoryNativeS3FileSystemContract.java @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3native; + +import java.io.IOException; + +public class TestInMemoryNativeS3FileSystemContract + extends NativeS3FileSystemContractBaseTest { + + @Override + NativeFileSystemStore getNativeFileSystemStore() throws IOException { + return new InMemoryNativeFileSystemStore(); + } +} diff --git a/src/test/org/apache/hadoop/http/TestGlobalFilter.java b/src/test/org/apache/hadoop/http/TestGlobalFilter.java new file mode 100644 index 00000000000..51ab60697f2 --- /dev/null +++ b/src/test/org/apache/hadoop/http/TestGlobalFilter.java @@ -0,0 +1,139 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.http; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.util.Set; +import java.util.TreeSet; + +import javax.servlet.Filter; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServletRequest; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestGlobalFilter extends junit.framework.TestCase { + static final Log LOG = LogFactory.getLog(HttpServer.class); + static final Set RECORDS = new TreeSet(); + + /** A very simple filter that records accessed uri's */ + static public class RecordingFilter implements Filter { + private FilterConfig filterConfig = null; + + public void init(FilterConfig filterConfig) { + this.filterConfig = filterConfig; + } + + public void destroy() { + this.filterConfig = null; + } + + public void doFilter(ServletRequest request, ServletResponse response, + FilterChain chain) throws IOException, ServletException { + if (filterConfig == null) + return; + + String uri = ((HttpServletRequest)request).getRequestURI(); + LOG.info("filtering " + uri); + RECORDS.add(uri); + chain.doFilter(request, response); + } + + /** Configuration for RecordingFilter */ + static public class Initializer extends FilterInitializer { + public Initializer() {} + + void initFilter(FilterContainer container) { + container.addGlobalFilter("recording", RecordingFilter.class.getName(), null); + } + } + } + + + /** access a url, ignoring some IOException such as the page does not exist */ + static void access(String urlstring) throws IOException { + LOG.warn("access " + urlstring); + URL url = new URL(urlstring); + URLConnection connection = url.openConnection(); + connection.connect(); + + try { + BufferedReader in = new BufferedReader(new InputStreamReader( + connection.getInputStream())); + try { + for(; in.readLine() != null; ); + } finally { + in.close(); + } + } catch(IOException ioe) { + LOG.warn("urlstring=" + urlstring, ioe); + } + } + + public void testServletFilter() throws Exception { + Configuration conf = new Configuration(); + + //start a http server with CountingFilter + conf.set(HttpServer.FILTER_INITIALIZER_PROPERTY, + RecordingFilter.Initializer.class.getName()); + HttpServer http = new HttpServer("datanode", "localhost", 0, true, conf); + http.start(); + + final String fsckURL = "/fsck"; + final String stacksURL = "/stacks"; + final String ajspURL = "/a.jsp"; + final String listPathsURL = "/listPaths"; + final String dataURL = "/data"; + final String streamFile = "/streamFile"; + final String rootURL = "/"; + final String allURL = "/*"; + final String outURL = "/static/a.out"; + final String logURL = "/logs/a.log"; + + final String[] urls = {fsckURL, stacksURL, ajspURL, listPathsURL, + dataURL, streamFile, rootURL, allURL, outURL, logURL}; + + //access the 
urls + final String prefix = "http://localhost:" + http.getPort(); + try { + for(int i = 0; i < urls.length; i++) { + access(prefix + urls[i]); + } + } finally { + http.stop(); + } + + LOG.info("RECORDS = " + RECORDS); + + //verify records + for(int i = 0; i < urls.length; i++) { + assertTrue(RECORDS.remove(urls[i])); + } + assertTrue(RECORDS.isEmpty()); + } +} diff --git a/src/test/org/apache/hadoop/http/TestServletFilter.java b/src/test/org/apache/hadoop/http/TestServletFilter.java new file mode 100644 index 00000000000..8052f9ad492 --- /dev/null +++ b/src/test/org/apache/hadoop/http/TestServletFilter.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.http; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.util.Random; + +import javax.servlet.Filter; +import javax.servlet.FilterChain; +import javax.servlet.FilterConfig; +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServletRequest; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestServletFilter extends junit.framework.TestCase { + static final Log LOG = LogFactory.getLog(HttpServer.class); + static volatile String uri = null; + + /** A very simple filter which record the uri filtered. 
*/ + static public class SimpleFilter implements Filter { + private FilterConfig filterConfig = null; + + public void init(FilterConfig filterConfig) { + this.filterConfig = filterConfig; + } + + public void destroy() { + this.filterConfig = null; + } + + public void doFilter(ServletRequest request, ServletResponse response, + FilterChain chain) throws IOException, ServletException { + if (filterConfig == null) + return; + + uri = ((HttpServletRequest)request).getRequestURI(); + LOG.info("filtering " + uri); + chain.doFilter(request, response); + } + + /** Configuration for the filter */ + static public class Initializer extends FilterInitializer { + public Initializer() {} + + void initFilter(FilterContainer container) { + container.addFilter("simple", SimpleFilter.class.getName(), null); + } + } + } + + + /** access a url, ignoring some IOException such as the page does not exist */ + static void access(String urlstring) throws IOException { + LOG.warn("access " + urlstring); + URL url = new URL(urlstring); + URLConnection connection = url.openConnection(); + connection.connect(); + + try { + BufferedReader in = new BufferedReader(new InputStreamReader( + connection.getInputStream())); + try { + for(; in.readLine() != null; ); + } finally { + in.close(); + } + } catch(IOException ioe) { + LOG.warn("urlstring=" + urlstring, ioe); + } + } + + public void testServletFilter() throws Exception { + Configuration conf = new Configuration(); + + //start a http server with CountingFilter + conf.set(HttpServer.FILTER_INITIALIZER_PROPERTY, + SimpleFilter.Initializer.class.getName()); + HttpServer http = new HttpServer("datanode", "localhost", 0, true, conf); + http.start(); + + final String fsckURL = "/fsck"; + final String stacksURL = "/stacks"; + final String ajspURL = "/a.jsp"; + final String logURL = "/logs/a.log"; + final String hadooplogoURL = "/static/hadoop-logo.jpg"; + + final String[] urls = {fsckURL, stacksURL, ajspURL, logURL, hadooplogoURL}; + final Random ran = new Random(); + final int[] sequence = new int[50]; + + //generate a random sequence and update counts + for(int i = 0; i < sequence.length; i++) { + sequence[i] = ran.nextInt(urls.length); + } + + //access the urls as the sequence + final String prefix = "http://localhost:" + http.getPort(); + try { + for(int i = 0; i < sequence.length; i++) { + access(prefix + urls[sequence[i]]); + + //make sure everything except fsck get filtered + if (sequence[i] == 0) { + assertEquals(null, uri); + } else { + assertEquals(urls[sequence[i]], uri); + uri = null; + } + } + } finally { + http.stop(); + } + } +} diff --git a/src/test/org/apache/hadoop/io/RandomDatum.java b/src/test/org/apache/hadoop/io/RandomDatum.java new file mode 100644 index 00000000000..ab8f34febab --- /dev/null +++ b/src/test/org/apache/hadoop/io/RandomDatum.java @@ -0,0 +1,108 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
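The two filter tests above differ only in how the filter is registered with the FilterContainer. A minimal standalone initializer, sketched here for illustration (it is not part of this patch; it reuses RecordingFilter from TestGlobalFilter and, like the tests, passes null for the filter parameters):

package org.apache.hadoop.http;

import org.apache.hadoop.conf.Configuration;

// Illustrative sketch: registers the same filter both ways so the
// difference the two tests exercise is visible side by side.
public class ExampleFilterInitializer extends FilterInitializer {
  void initFilter(FilterContainer container) {
    // TestServletFilter expects a filter added with addFilter() to see
    // /stacks, jsp pages, /logs/* and /static/*, but not /fsck.
    container.addFilter("example",
        TestGlobalFilter.RecordingFilter.class.getName(), null);
    // TestGlobalFilter expects a global filter to see every accessed
    // path, including /fsck, /listPaths, /data and /streamFile.
    container.addGlobalFilter("example-global",
        TestGlobalFilter.RecordingFilter.class.getName(), null);
  }

  /** Wiring mirrors the tests: point the HttpServer at this initializer. */
  static Configuration exampleConf() {
    Configuration conf = new Configuration();
    conf.set(HttpServer.FILTER_INITIALIZER_PROPERTY,
        ExampleFilterInitializer.class.getName());
    return conf;
  }
}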
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import java.util.*; +import java.io.*; + +public class RandomDatum implements WritableComparable { + private int length; + private byte[] data; + + public RandomDatum() {} + + public RandomDatum(Random random) { + length = 10 + (int) Math.pow(10.0, random.nextFloat() * 3.0); + data = new byte[length]; + random.nextBytes(data); + } + + public int getLength() { + return length; + } + + public void write(DataOutput out) throws IOException { + out.writeInt(length); + out.write(data); + } + + public void readFields(DataInput in) throws IOException { + length = in.readInt(); + if (data == null || length > data.length) + data = new byte[length]; + in.readFully(data, 0, length); + } + + public int compareTo(Object o) { + RandomDatum that = (RandomDatum)o; + return WritableComparator.compareBytes(this.data, 0, this.length, + that.data, 0, that.length); + } + + public boolean equals(Object o) { + return compareTo(o) == 0; + } + + private static final char[] HEX_DIGITS = + {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; + + /** Returns a string representation of this object. */ + public String toString() { + StringBuffer buf = new StringBuffer(length*2); + for (int i = 0; i < length; i++) { + int b = data[i]; + buf.append(HEX_DIGITS[(b >> 4) & 0xf]); + buf.append(HEX_DIGITS[b & 0xf]); + } + return buf.toString(); + } + + public static class Generator { + Random random; + + private RandomDatum key; + private RandomDatum value; + + public Generator() { random = new Random(); } + public Generator(int seed) { random = new Random(seed); } + + public RandomDatum getKey() { return key; } + public RandomDatum getValue() { return value; } + + public void next() { + key = new RandomDatum(random); + value = new RandomDatum(random); + } + } + + /** A WritableComparator optimized for RandomDatum. */ + public static class Comparator extends WritableComparator { + public Comparator() { + super(RandomDatum.class); + } + + public int compare(byte[] b1, int s1, int l1, + byte[] b2, int s2, int l2) { + int n1 = readInt(b1, s1); + int n2 = readInt(b2, s2); + return compareBytes(b1, s1+4, n1, b2, s2+4, n2); + } + } + +} diff --git a/src/test/org/apache/hadoop/io/TestArrayFile.java b/src/test/org/apache/hadoop/io/TestArrayFile.java new file mode 100644 index 00000000000..f279bd74319 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestArrayFile.java @@ -0,0 +1,155 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
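RandomDatum ships an optimized raw-byte Comparator but, unlike Hadoop's built-in Writables, does not register it in a static block, so WritableComparator.get(RandomDatum.class) (the lookup TestSetFile relies on further down) would otherwise fall back to the generic deserializing comparator. The following sketch is illustrative only and not part of this patch; it assumes WritableComparator.define() is available for registration, and the seed and class name are arbitrary:

package org.apache.hadoop.io;

import java.io.IOException;

// Sketch: register and exercise RandomDatum's raw-byte comparator.
public class RandomDatumComparatorDemo {
  public static void main(String[] args) throws IOException {
    // Register the optimized comparator so WritableComparator.get()
    // returns it instead of a generic, deserializing one.
    WritableComparator.define(RandomDatum.class, new RandomDatum.Comparator());

    RandomDatum.Generator gen = new RandomDatum.Generator(42);
    gen.next();
    RandomDatum a = gen.getKey();
    RandomDatum b = gen.getValue();

    // Serialize both records, then compare them without deserializing.
    DataOutputBuffer bufA = new DataOutputBuffer();
    DataOutputBuffer bufB = new DataOutputBuffer();
    a.write(bufA);
    b.write(bufB);

    WritableComparator cmp = WritableComparator.get(RandomDatum.class);
    int byBytes = cmp.compare(bufA.getData(), 0, bufA.getLength(),
                              bufB.getData(), 0, bufB.getLength());
    int byObjects = a.compareTo(b);
    // The two orderings must agree in sign for sorted files to be valid.
    System.out.println("raw=" + byBytes + " object=" + byObjects);
  }
}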
+ */ + +package org.apache.hadoop.io; + +import java.io.*; +import junit.framework.TestCase; + +import org.apache.commons.logging.*; + +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; + +/** Support for flat files of binary key/value pairs. */ +public class TestArrayFile extends TestCase { + private static final Log LOG = LogFactory.getLog(TestArrayFile.class); + private static String FILE = + System.getProperty("test.build.data",".") + "/test.array"; + + public TestArrayFile(String name) { + super(name); + } + + public void testArrayFile() throws Exception { + Configuration conf = new Configuration(); + FileSystem fs = FileSystem.getLocal(conf); + RandomDatum[] data = generate(10000); + writeTest(fs, data, FILE); + readTest(fs, data, FILE, conf); + } + + public void testEmptyFile() throws Exception { + Configuration conf = new Configuration(); + FileSystem fs = FileSystem.getLocal(conf); + writeTest(fs, new RandomDatum[0], FILE); + ArrayFile.Reader reader = new ArrayFile.Reader(fs, FILE, conf); + assertNull(reader.get(0, new RandomDatum())); + reader.close(); + } + + private static RandomDatum[] generate(int count) { + LOG.debug("generating " + count + " records in debug"); + RandomDatum[] data = new RandomDatum[count]; + RandomDatum.Generator generator = new RandomDatum.Generator(); + for (int i = 0; i < count; i++) { + generator.next(); + data[i] = generator.getValue(); + } + return data; + } + + private static void writeTest(FileSystem fs, RandomDatum[] data, String file) + throws IOException { + Configuration conf = new Configuration(); + MapFile.delete(fs, file); + LOG.debug("creating with " + data.length + " debug"); + ArrayFile.Writer writer = new ArrayFile.Writer(conf, fs, file, RandomDatum.class); + writer.setIndexInterval(100); + for (int i = 0; i < data.length; i++) + writer.append(data[i]); + writer.close(); + } + + private static void readTest(FileSystem fs, RandomDatum[] data, String file, Configuration conf) + throws IOException { + RandomDatum v = new RandomDatum(); + LOG.debug("reading " + data.length + " debug"); + ArrayFile.Reader reader = new ArrayFile.Reader(fs, file, conf); + for (int i = 0; i < data.length; i++) { // try forwards + reader.get(i, v); + if (!v.equals(data[i])) { + throw new RuntimeException("wrong value at " + i); + } + } + for (int i = data.length-1; i >= 0; i--) { // then backwards + reader.get(i, v); + if (!v.equals(data[i])) { + throw new RuntimeException("wrong value at " + i); + } + } + reader.close(); + LOG.debug("done reading " + data.length + " debug"); + } + + + /** For debugging and testing. 
*/ + public static void main(String[] args) throws Exception { + int count = 1024 * 1024; + boolean create = true; + boolean check = true; + String file = FILE; + String usage = "Usage: TestArrayFile [-count N] [-nocreate] [-nocheck] file"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + Configuration conf = new Configuration(); + int i = 0; + Path fpath = null; + FileSystem fs = null; + try { + for (; i < args.length; i++) { // parse command line + if (args[i] == null) { + continue; + } else if (args[i].equals("-count")) { + count = Integer.parseInt(args[++i]); + } else if (args[i].equals("-nocreate")) { + create = false; + } else if (args[i].equals("-nocheck")) { + check = false; + } else { + // file is required parameter + file = args[i]; + fpath=new Path(file); + } + } + + fs = fpath.getFileSystem(conf); + + LOG.info("count = " + count); + LOG.info("create = " + create); + LOG.info("check = " + check); + LOG.info("file = " + file); + + RandomDatum[] data = generate(count); + + if (create) { + writeTest(fs, data, file); + } + + if (check) { + readTest(fs, data, file, conf); + } + } finally { + fs.close(); + } + } +} diff --git a/src/test/org/apache/hadoop/io/TestArrayWritable.java b/src/test/org/apache/hadoop/io/TestArrayWritable.java new file mode 100644 index 00000000000..47d0ce9f635 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestArrayWritable.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import java.io.*; + +import junit.framework.TestCase; + +/** Unit tests for ArrayWritable */ +public class TestArrayWritable extends TestCase { + + static class TextArrayWritable extends ArrayWritable { + public TextArrayWritable() { + super(Text.class); + } + } + + public TestArrayWritable(String name) { + super(name); + } + + /** + * If valueClass is undefined, readFields should throw an exception indicating + * that the field is null. Otherwise, readFields should succeed. + */ + public void testThrowUndefinedValueException() throws IOException { + // Get a buffer containing a simple text array + Text[] elements = {new Text("zero"), new Text("one"), new Text("two")}; + TextArrayWritable sourceArray = new TextArrayWritable(); + sourceArray.set(elements); + + // Write it to a normal output buffer + DataOutputBuffer out = new DataOutputBuffer(); + DataInputBuffer in = new DataInputBuffer(); + sourceArray.write(out); + + // Read the output buffer with TextReadable. 
Since the valueClass is defined, + // this should succeed + TextArrayWritable destArray = new TextArrayWritable(); + in.reset(out.getData(), out.getLength()); + destArray.readFields(in); + Writable[] destElements = destArray.get(); + assertTrue(destElements.length == elements.length); + for (int i = 0; i < elements.length; i++) { + assertEquals(destElements[i],elements[i]); + } + } +} diff --git a/src/test/org/apache/hadoop/io/TestBloomMapFile.java b/src/test/org/apache/hadoop/io/TestBloomMapFile.java new file mode 100644 index 00000000000..2a7d22455f6 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestBloomMapFile.java @@ -0,0 +1,70 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import junit.framework.TestCase; + +public class TestBloomMapFile extends TestCase { + private static Configuration conf = new Configuration(); + + public void testMembershipTest() throws Exception { + // write the file + Path dirName = new Path(System.getProperty("test.build.data",".") + + getName() + ".bloommapfile"); + FileSystem fs = FileSystem.getLocal(conf); + Path qualifiedDirName = fs.makeQualified(dirName); + conf.setInt("io.mapfile.bloom.size", 2048); + BloomMapFile.Writer writer = new BloomMapFile.Writer(conf, fs, + qualifiedDirName.toString(), IntWritable.class, Text.class); + IntWritable key = new IntWritable(); + Text value = new Text(); + for (int i = 0; i < 2000; i += 2) { + key.set(i); + value.set("00" + i); + writer.append(key, value); + } + writer.close(); + + BloomMapFile.Reader reader = new BloomMapFile.Reader(fs, + qualifiedDirName.toString(), conf); + // check false positives rate + int falsePos = 0; + int falseNeg = 0; + for (int i = 0; i < 2000; i++) { + key.set(i); + boolean exists = reader.probablyHasKey(key); + if (i % 2 == 0) { + if (!exists) falseNeg++; + } else { + if (exists) falsePos++; + } + } + reader.close(); + fs.delete(qualifiedDirName, true); + System.out.println("False negatives: " + falseNeg); + assertEquals(0, falseNeg); + System.out.println("False positives: " + falsePos); + assertTrue(falsePos < 2); + } + +} diff --git a/src/test/org/apache/hadoop/io/TestBytesWritable.java b/src/test/org/apache/hadoop/io/TestBytesWritable.java new file mode 100644 index 00000000000..35e0d0ed827 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestBytesWritable.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.io; + +import junit.framework.TestCase; + +/** + * This is the unit test for BytesWritable. + */ +public class TestBytesWritable extends TestCase { + + public void testSizeChange() throws Exception { + byte[] hadoop = "hadoop".getBytes(); + BytesWritable buf = new BytesWritable(hadoop); + int size = buf.getLength(); + int orig_capacity = buf.getCapacity(); + buf.setSize(size*2); + int new_capacity = buf.getCapacity(); + System.arraycopy(buf.getBytes(), 0, buf.getBytes(), size, size); + assertTrue(new_capacity >= size * 2); + assertEquals(size * 2, buf.getLength()); + assertTrue(new_capacity != orig_capacity); + buf.setSize(size*4); + assertTrue(new_capacity != buf.getCapacity()); + for(int i=0; i < size*2; ++i) { + assertEquals(hadoop[i%size], buf.getBytes()[i]); + } + // shrink the buffer + buf.setCapacity(1); + // make sure the size has been cut down too + assertEquals(1, buf.getLength()); + // but that the data is still there + assertEquals(hadoop[0], buf.getBytes()[0]); + } + + public void testHash() throws Exception { + byte[] owen = "owen".getBytes(); + BytesWritable buf = new BytesWritable(owen); + assertEquals(4347922, buf.hashCode()); + buf.setCapacity(10000); + assertEquals(4347922, buf.hashCode()); + buf.setSize(0); + assertEquals(1, buf.hashCode()); + } + + public void testCompare() throws Exception { + byte[][] values = new byte[][]{"abc".getBytes(), + "ad".getBytes(), + "abcd".getBytes(), + "".getBytes(), + "b".getBytes()}; + BytesWritable[] buf = new BytesWritable[values.length]; + for(int i=0; i < values.length; ++i) { + buf[i] = new BytesWritable(values[i]); + } + // check to make sure the compare function is symetric and reflexive + for(int i=0; i < values.length; ++i) { + for(int j=0; j < values.length; ++j) { + assertTrue(buf[i].compareTo(buf[j]) == -buf[j].compareTo(buf[i])); + assertTrue((i == j) == (buf[i].compareTo(buf[j]) == 0)); + } + } + assertTrue(buf[0].compareTo(buf[1]) < 0); + assertTrue(buf[1].compareTo(buf[2]) > 0); + assertTrue(buf[2].compareTo(buf[3]) > 0); + assertTrue(buf[3].compareTo(buf[4]) < 0); + } + + private void checkToString(byte[] input, String expected) { + String actual = new BytesWritable(input).toString(); + assertEquals(expected, actual); + } + + public void testToString() { + checkToString(new byte[]{0,1,2,0x10}, "00 01 02 10"); + checkToString(new byte[]{-0x80, -0x7f, -0x1, -0x2, 1, 0}, + "80 81 ff fe 01 00"); + } +} + diff --git a/src/test/org/apache/hadoop/io/TestDefaultStringifier.java b/src/test/org/apache/hadoop/io/TestDefaultStringifier.java new file mode 100644 index 00000000000..c96cc732938 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestDefaultStringifier.java @@ -0,0 +1,113 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
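A practical consequence of testSizeChange above: getBytes() returns the whole backing array, whose length tracks the capacity, so only the first getLength() bytes are valid data. A short illustrative sketch (not part of this patch) of slicing out just the valid bytes:

package org.apache.hadoop.io;

// Sketch: copy only the logical contents of a BytesWritable, since the
// backing array returned by getBytes() may be larger than getLength().
public class BytesWritableSliceSketch {
  public static byte[] copyValidBytes(BytesWritable bw) {
    byte[] valid = new byte[bw.getLength()];
    System.arraycopy(bw.getBytes(), 0, valid, 0, bw.getLength());
    return valid;
  }

  public static void main(String[] args) {
    BytesWritable bw = new BytesWritable("hadoop".getBytes());
    bw.setCapacity(100); // grows the backing array, keeps the 6 data bytes
    System.out.println("capacity=" + bw.getCapacity()
        + " length=" + bw.getLength()
        + " valid=" + new String(copyValidBytes(bw)));
  }
}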
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import java.io.IOException; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; + +public class TestDefaultStringifier extends TestCase { + + private static Configuration conf = new Configuration(); + private static final Log LOG = LogFactory.getLog(TestDefaultStringifier.class); + + private char[] alphabet = "abcdefghijklmnopqrstuvwxyz".toCharArray(); + + public void testWithWritable() throws Exception { + + conf.set("io.serializations", "org.apache.hadoop.io.serializer.WritableSerialization"); + + LOG.info("Testing DefaultStringifier with Text"); + + Random random = new Random(); + + //test with a Text + for(int i=0;i<10;i++) { + //generate a random string + StringBuilder builder = new StringBuilder(); + int strLen = random.nextInt(40); + for(int j=0; j< strLen; j++) { + builder.append(alphabet[random.nextInt(alphabet.length)]); + } + Text text = new Text(builder.toString()); + DefaultStringifier stringifier = new DefaultStringifier(conf, Text.class); + + String str = stringifier.toString(text); + Text claimedText = stringifier.fromString(str); + LOG.info("Object: " + text); + LOG.info("String representation of the object: " + str); + assertEquals(text, claimedText); + } + } + + public void testWithJavaSerialization() throws Exception { + conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization"); + + LOG.info("Testing DefaultStringifier with Serializable Integer"); + + //Integer implements Serializable + Integer testInt = Integer.valueOf(42); + DefaultStringifier stringifier = new DefaultStringifier(conf, Integer.class); + + String str = stringifier.toString(testInt); + Integer claimedInt = stringifier.fromString(str); + LOG.info("String representation of the object: " + str); + + assertEquals(testInt, claimedInt); + } + + public void testStoreLoad() throws IOException { + + LOG.info("Testing DefaultStringifier#store() and #load()"); + conf.set("io.serializations", "org.apache.hadoop.io.serializer.WritableSerialization"); + Text text = new Text("uninteresting test string"); + String keyName = "test.defaultstringifier.key1"; + + DefaultStringifier.store(conf,text, keyName); + + Text claimedText = DefaultStringifier.load(conf, keyName, Text.class); + assertEquals("DefaultStringifier#load() or #store() might be flawed" + , text, claimedText); + + } + + public void testStoreLoadArray() throws IOException { + LOG.info("Testing DefaultStringifier#storeArray() and #loadArray()"); + conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization"); + + String keyName = "test.defaultstringifier.key2"; + + Integer[] array = new Integer[] {1,2,3,4,5}; + + + DefaultStringifier.storeArray(conf, array, keyName); + + Integer[] claimedArray = DefaultStringifier.loadArray(conf, keyName, Integer.class); + 
for (int i = 0; i < array.length; i++) { + assertEquals("two arrays are not equal", array[i], claimedArray[i]); + } + + } + +} diff --git a/src/test/org/apache/hadoop/io/TestEnumSetWritable.java b/src/test/org/apache/hadoop/io/TestEnumSetWritable.java new file mode 100644 index 00000000000..a512bb1bc2d --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestEnumSetWritable.java @@ -0,0 +1,103 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import java.io.IOException; +import java.util.EnumSet; + +import junit.framework.TestCase; + +/** Unit test for EnumSetWritable */ +public class TestEnumSetWritable extends TestCase { + + enum TestEnumSet { + CREATE, OVERWRITE, APPEND; + } + + EnumSet nonEmptyFlag = EnumSet.of(TestEnumSet.APPEND); + EnumSetWritable nonEmptyFlagWritable = new EnumSetWritable( + nonEmptyFlag); + + @SuppressWarnings("unchecked") + public void testSerializeAndDeserializeNonEmpty() throws IOException { + DataOutputBuffer out = new DataOutputBuffer(); + ObjectWritable.writeObject(out, nonEmptyFlagWritable, nonEmptyFlagWritable + .getClass(), null); + DataInputBuffer in = new DataInputBuffer(); + in.reset(out.getData(), out.getLength()); + EnumSet read = ((EnumSetWritable) ObjectWritable + .readObject(in, null)).get(); + assertEquals(read, nonEmptyFlag); + } + + EnumSet emptyFlag = EnumSet.noneOf(TestEnumSet.class); + + @SuppressWarnings("unchecked") + public void testSerializeAndDeserializeEmpty() throws IOException { + + boolean gotException = false; + try { + new EnumSetWritable(emptyFlag); + } catch (RuntimeException e) { + gotException = true; + } + + assertTrue( + "Instantiate empty EnumSetWritable with no element type class providesd should throw exception.", + gotException); + + EnumSetWritable emptyFlagWritable = new EnumSetWritable( + emptyFlag, TestEnumSet.class); + DataOutputBuffer out = new DataOutputBuffer(); + ObjectWritable.writeObject(out, emptyFlagWritable, emptyFlagWritable + .getClass(), null); + DataInputBuffer in = new DataInputBuffer(); + in.reset(out.getData(), out.getLength()); + EnumSet read = ((EnumSetWritable) ObjectWritable + .readObject(in, null)).get(); + assertEquals(read, emptyFlag); + } + + @SuppressWarnings("unchecked") + public void testSerializeAndDeserializeNull() throws IOException { + + boolean gotException = false; + try { + new EnumSetWritable(null); + } catch (RuntimeException e) { + gotException = true; + } + + assertTrue( + "Instantiate empty EnumSetWritable with no element type class providesd should throw exception.", + gotException); + + EnumSetWritable nullFlagWritable = new EnumSetWritable( + null, TestEnumSet.class); + + DataOutputBuffer out = new DataOutputBuffer(); + ObjectWritable.writeObject(out, nullFlagWritable, nullFlagWritable + 
.getClass(), null); + DataInputBuffer in = new DataInputBuffer(); + in.reset(out.getData(), out.getLength()); + EnumSet read = ((EnumSetWritable) ObjectWritable + .readObject(in, null)).get(); + assertEquals(read, null); + } +} diff --git a/src/test/org/apache/hadoop/io/TestGenericWritable.java b/src/test/org/apache/hadoop/io/TestGenericWritable.java new file mode 100644 index 00000000000..486d93d4385 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestGenericWritable.java @@ -0,0 +1,178 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; + +/** + * TestCase for {@link GenericWritable} class. + * @see TestWritable#testWritable(Writable) + */ +public class TestGenericWritable extends TestCase { + + private Configuration conf; + public static final String CONF_TEST_KEY = "test.generic.writable"; + public static final String CONF_TEST_VALUE = "dummy"; + + @Override + protected void setUp() throws Exception { + super.setUp(); + conf = new Configuration(); + //set the configuration parameter + conf.set(CONF_TEST_KEY, CONF_TEST_VALUE); + } + + /** Dummy class for testing {@link GenericWritable} */ + public static class Foo implements Writable { + private String foo = "foo"; + public void readFields(DataInput in) throws IOException { + foo = Text.readString(in); + } + public void write(DataOutput out) throws IOException { + Text.writeString(out, foo); + } + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Foo)) + return false; + return this.foo.equals(((Foo)obj).foo); + } + } + /** Dummy class for testing {@link GenericWritable} */ + public static class Bar implements Writable, Configurable { + private int bar = 42; //The Answer to The Ultimate Question Of Life, the Universe and Everything + private Configuration conf = null; + public void readFields(DataInput in) throws IOException { + bar = in.readInt(); + } + public void write(DataOutput out) throws IOException { + out.writeInt(bar); + } + public Configuration getConf() { + return conf; + } + public void setConf(Configuration conf) { + this.conf = conf; + } + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Bar)) + return false; + return this.bar == ((Bar)obj).bar; + } + } + + /** Dummy class for testing {@link GenericWritable} */ + public static class Baz extends Bar { + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + //needs a configuration parameter + assertEquals("Configuration is not set for the wrapped object", + CONF_TEST_VALUE, 
getConf().get(CONF_TEST_KEY)); + } + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + } + } + + /** Dummy class for testing {@link GenericWritable} */ + public static class FooGenericWritable extends GenericWritable { + @Override + @SuppressWarnings("unchecked") + protected Class[] getTypes() { + return new Class[] {Foo.class, Bar.class, Baz.class}; + } + @Override + public boolean equals(Object obj) { + if(! (obj instanceof FooGenericWritable)) + return false; + return get().equals(((FooGenericWritable)obj).get()); + } + } + + public void testFooWritable() throws Exception { + System.out.println("Testing Writable wrapped in GenericWritable"); + FooGenericWritable generic = new FooGenericWritable(); + generic.setConf(conf); + Foo foo = new Foo(); + generic.set(foo); + TestWritable.testWritable(generic); + } + + public void testBarWritable() throws Exception { + System.out.println("Testing Writable, Configurable wrapped in GenericWritable"); + FooGenericWritable generic = new FooGenericWritable(); + generic.setConf(conf); + Bar bar = new Bar(); + bar.setConf(conf); + generic.set(bar); + + //test writing generic writable + FooGenericWritable after + = (FooGenericWritable)TestWritable.testWritable(generic, conf); + + //test configuration + System.out.println("Testing if Configuration is passed to wrapped classes"); + assertTrue(after.get() instanceof Configurable); + assertNotNull(((Configurable)after.get()).getConf()); + } + + public void testBazWritable() throws Exception { + System.out.println("Testing for GenericWritable to find class names"); + FooGenericWritable generic = new FooGenericWritable(); + generic.setConf(conf); + Baz baz = new Baz(); + generic.set(baz); + TestWritable.testWritable(generic, conf); + } + + public void testSet() throws Exception { + Foo foo = new Foo(); + FooGenericWritable generic = new FooGenericWritable(); + //exception should not occur + generic.set(foo); + + try { + //exception should occur, since IntWritable is not registered + generic = new FooGenericWritable(); + generic.set(new IntWritable(1)); + fail("Generic writable should have thrown an exception for a Writable not registered"); + }catch (RuntimeException e) { + //ignore + } + + } + + public void testGet() throws Exception { + Foo foo = new Foo(); + FooGenericWritable generic = new FooGenericWritable(); + generic.set(foo); + assertEquals(foo, generic.get()); + } + +} diff --git a/src/test/org/apache/hadoop/io/TestMD5Hash.java b/src/test/org/apache/hadoop/io/TestMD5Hash.java new file mode 100644 index 00000000000..feb1107ed46 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestMD5Hash.java @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
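TestGenericWritable exercises the wrapper in isolation; the usual reason to subclass GenericWritable is to mix several registered value types in a single SequenceFile. The sketch below is illustrative and not part of this patch: it reuses the Foo, Bar and FooGenericWritable classes defined above, and the file name and key type are arbitrary choices.

package org.apache.hadoop.io;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch: store Foo and Bar records in one SequenceFile by wrapping
// them in the FooGenericWritable subclass from TestGenericWritable.
public class GenericWritableSequenceFileSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path file = new Path(System.getProperty("test.build.data", "."),
                         "generic.seq");

    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file,
        IntWritable.class, TestGenericWritable.FooGenericWritable.class);
    try {
      TestGenericWritable.FooGenericWritable value =
          new TestGenericWritable.FooGenericWritable();
      value.setConf(conf);

      value.set(new TestGenericWritable.Foo());       // record 1 wraps a Foo
      writer.append(new IntWritable(1), value);

      TestGenericWritable.Bar bar = new TestGenericWritable.Bar();
      bar.setConf(conf);
      value.set(bar);                                 // record 2 wraps a Bar
      writer.append(new IntWritable(2), value);
    } finally {
      writer.close();
    }

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
      IntWritable key = new IntWritable();
      TestGenericWritable.FooGenericWritable value =
          new TestGenericWritable.FooGenericWritable();
      value.setConf(conf); // wrapped Configurables need the conf on read
      while (reader.next(key, value)) {
        // get() hands back the wrapped instance; its class identifies
        // which registered type this record carried.
        System.out.println(key + " -> "
            + value.get().getClass().getSimpleName());
      }
    } finally {
      reader.close();
    }
  }
}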
+ */ + +package org.apache.hadoop.io; + +import org.apache.hadoop.io.TestWritable; +import junit.framework.TestCase; +import java.security.MessageDigest; +import java.util.Random; + +/** Unit tests for MD5Hash. */ +public class TestMD5Hash extends TestCase { + public TestMD5Hash(String name) { super(name); } + + private static final Random RANDOM = new Random(); + + public static MD5Hash getTestHash() throws Exception { + MessageDigest digest = MessageDigest.getInstance("MD5"); + byte[] buffer = new byte[1024]; + RANDOM.nextBytes(buffer); + digest.update(buffer); + return new MD5Hash(digest.digest()); + } + + protected static byte[] D00 = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + protected static byte[] DFF = new byte[] {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + + public void testMD5Hash() throws Exception { + MD5Hash md5Hash = getTestHash(); + + final MD5Hash md5Hash00 + = new MD5Hash(D00); + + final MD5Hash md5HashFF + = new MD5Hash(DFF); + + MD5Hash orderedHash = new MD5Hash(new byte[]{1,2,3,4,5,6,7,8,9,10,11,12, + 13,14,15,16}); + MD5Hash backwardHash = new MD5Hash(new byte[]{-1,-2,-3,-4,-5,-6,-7,-8, + -9,-10,-11,-12, -13, -14, + -15,-16}); + MD5Hash closeHash1 = new MD5Hash(new byte[]{-1,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0}); + MD5Hash closeHash2 = new MD5Hash(new byte[]{-1,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0}); + + // test i/o + TestWritable.testWritable(md5Hash); + TestWritable.testWritable(md5Hash00); + TestWritable.testWritable(md5HashFF); + + // test equals() + assertEquals(md5Hash, md5Hash); + assertEquals(md5Hash00, md5Hash00); + assertEquals(md5HashFF, md5HashFF); + + // test compareTo() + assertTrue(md5Hash.compareTo(md5Hash) == 0); + assertTrue(md5Hash00.compareTo(md5Hash) < 0); + assertTrue(md5HashFF.compareTo(md5Hash) > 0); + + // test toString and string ctor + assertEquals(md5Hash, new MD5Hash(md5Hash.toString())); + assertEquals(md5Hash00, new MD5Hash(md5Hash00.toString())); + assertEquals(md5HashFF, new MD5Hash(md5HashFF.toString())); + + assertEquals(0x01020304, orderedHash.quarterDigest()); + assertEquals(0xfffefdfc, backwardHash.quarterDigest()); + + assertEquals(0x0102030405060708L, orderedHash.halfDigest()); + assertEquals(0xfffefdfcfbfaf9f8L, backwardHash.halfDigest()); + assertTrue("hash collision", + closeHash1.hashCode() != closeHash2.hashCode()); + + Thread t1 = new Thread() { + public void run() { + for (int i = 0; i < 100; i++) { + MD5Hash hash = new MD5Hash(DFF); + assertEquals(hash, md5HashFF); + } + } + }; + + Thread t2 = new Thread() { + public void run() { + for (int i = 0; i < 100; i++) { + MD5Hash hash = new MD5Hash(D00); + assertEquals(hash, md5Hash00); + } + } + }; + + t1.start(); + t2.start(); + t1.join(); + t2.join(); + + } + +} diff --git a/src/test/org/apache/hadoop/io/TestMapFile.java b/src/test/org/apache/hadoop/io/TestMapFile.java new file mode 100644 index 00000000000..f006d4f4013 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestMapFile.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.io; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import junit.framework.TestCase; + +public class TestMapFile extends TestCase { + private static Configuration conf = new Configuration(); + + /** + * Test getClosest feature. + * @throws Exception + */ + public void testGetClosest() throws Exception { + // Write a mapfile of simple data: keys are + Path dirName = new Path(System.getProperty("test.build.data",".") + + getName() + ".mapfile"); + FileSystem fs = FileSystem.getLocal(conf); + Path qualifiedDirName = fs.makeQualified(dirName); + // Make an index entry for every third insertion. + MapFile.Writer.setIndexInterval(conf, 3); + MapFile.Writer writer = new MapFile.Writer(conf, fs, + qualifiedDirName.toString(), Text.class, Text.class); + // Assert that the index interval is 1 + assertEquals(3, writer.getIndexInterval()); + // Add entries up to 100 in intervals of ten. + final int FIRST_KEY = 10; + for (int i = FIRST_KEY; i < 100; i += 10) { + String iStr = Integer.toString(i); + Text t = new Text("00".substring(iStr.length()) + iStr); + writer.append(t, t); + } + writer.close(); + // Now do getClosest on created mapfile. + MapFile.Reader reader = new MapFile.Reader(fs, qualifiedDirName.toString(), + conf); + Text key = new Text("55"); + Text value = new Text(); + Text closest = (Text)reader.getClosest(key, value); + // Assert that closest after 55 is 60 + assertEquals(new Text("60"), closest); + // Get closest that falls before the passed key: 50 + closest = (Text)reader.getClosest(key, value, true); + assertEquals(new Text("50"), closest); + // Test get closest when we pass explicit key + final Text TWENTY = new Text("20"); + closest = (Text)reader.getClosest(TWENTY, value); + assertEquals(TWENTY, closest); + closest = (Text)reader.getClosest(TWENTY, value, true); + assertEquals(TWENTY, closest); + // Test what happens at boundaries. Assert if searching a key that is + // less than first key in the mapfile, that the first key is returned. + key = new Text("00"); + closest = (Text)reader.getClosest(key, value); + assertEquals(FIRST_KEY, Integer.parseInt(closest.toString())); + + // If we're looking for the first key before, and we pass in a key before + // the first key in the file, we should get null + closest = (Text)reader.getClosest(key, value, true); + assertNull(closest); + + // Assert that null is returned if key is > last entry in mapfile. 
+ key = new Text("99"); + closest = (Text)reader.getClosest(key, value); + assertNull(closest); + + // If we were looking for the key before, we should get the last key + closest = (Text)reader.getClosest(key, value, true); + assertEquals(new Text("90"), closest); + } + + public void testMidKey() throws Exception { + // Write a mapfile of simple data: keys are + Path dirName = new Path(System.getProperty("test.build.data",".") + + getName() + ".mapfile"); + FileSystem fs = FileSystem.getLocal(conf); + Path qualifiedDirName = fs.makeQualified(dirName); + + MapFile.Writer writer = new MapFile.Writer(conf, fs, + qualifiedDirName.toString(), IntWritable.class, IntWritable.class); + writer.append(new IntWritable(1), new IntWritable(1)); + writer.close(); + // Now do getClosest on created mapfile. + MapFile.Reader reader = new MapFile.Reader(fs, qualifiedDirName.toString(), + conf); + assertEquals(new IntWritable(1), reader.midKey()); + } + + + public void testMidKeyEmpty() throws Exception { + // Write a mapfile of simple data: keys are + Path dirName = new Path(System.getProperty("test.build.data",".") + + getName() + ".mapfile"); + FileSystem fs = FileSystem.getLocal(conf); + Path qualifiedDirName = fs.makeQualified(dirName); + + MapFile.Writer writer = new MapFile.Writer(conf, fs, + qualifiedDirName.toString(), IntWritable.class, IntWritable.class); + writer.close(); + // Now do getClosest on created mapfile. + MapFile.Reader reader = new MapFile.Reader(fs, qualifiedDirName.toString(), + conf); + assertEquals(null, reader.midKey()); + } +} diff --git a/src/test/org/apache/hadoop/io/TestMapWritable.java b/src/test/org/apache/hadoop/io/TestMapWritable.java new file mode 100644 index 00000000000..3d8c4ab3c20 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestMapWritable.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.io; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.util.Map; + +import junit.framework.TestCase; + +/** + * Tests MapWritable + */ +public class TestMapWritable extends TestCase { + /** the test */ + @SuppressWarnings("unchecked") + public void testMapWritable() { + Text[] keys = { + new Text("key1"), + new Text("key2"), + new Text("Key3"), + }; + + BytesWritable[] values = { + new BytesWritable("value1".getBytes()), + new BytesWritable("value2".getBytes()), + new BytesWritable("value3".getBytes()) + }; + + MapWritable inMap = new MapWritable(); + for (int i = 0; i < keys.length; i++) { + inMap.put(keys[i], values[i]); + } + + MapWritable outMap = new MapWritable(inMap); + assertEquals(inMap.size(), outMap.size()); + + for (Map.Entry e: inMap.entrySet()) { + assertTrue(outMap.containsKey(e.getKey())); + assertEquals(0, ((WritableComparable) outMap.get(e.getKey())).compareTo( + e.getValue())); + } + + // Now for something a little harder... + + Text[] maps = { + new Text("map1"), + new Text("map2") + }; + + MapWritable mapOfMaps = new MapWritable(); + mapOfMaps.put(maps[0], inMap); + mapOfMaps.put(maps[1], outMap); + + MapWritable copyOfMapOfMaps = new MapWritable(mapOfMaps); + for (int i = 0; i < maps.length; i++) { + assertTrue(copyOfMapOfMaps.containsKey(maps[i])); + MapWritable a = (MapWritable) mapOfMaps.get(maps[i]); + MapWritable b = (MapWritable) copyOfMapOfMaps.get(maps[i]); + assertEquals(a.size(), b.size()); + for (Writable key: a.keySet()) { + assertTrue(b.containsKey(key)); + + // This will work because we know what we put into each set + + WritableComparable aValue = (WritableComparable) a.get(key); + WritableComparable bValue = (WritableComparable) b.get(key); + assertEquals(0, aValue.compareTo(bValue)); + } + } + } + + /** + * Test that number of "unknown" classes is propagated across multiple copies. + */ + @SuppressWarnings("deprecation") + public void testForeignClass() { + MapWritable inMap = new MapWritable(); + inMap.put(new Text("key"), new UTF8("value")); + inMap.put(new Text("key2"), new UTF8("value2")); + MapWritable outMap = new MapWritable(inMap); + MapWritable copyOfCopy = new MapWritable(outMap); + assertEquals(1, copyOfCopy.getNewClasses()); + } + + /** + * Assert MapWritable does not grow across calls to readFields. + * @throws Exception + * @see HADOOP-2244 + */ + public void testMultipleCallsToReadFieldsAreSafe() throws Exception { + // Create an instance and add a key/value. + MapWritable m = new MapWritable(); + final Text t = new Text(getName()); + m.put(t, t); + // Get current size of map. Key values are 't'. + int count = m.size(); + // Now serialize... save off the bytes. + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos); + m.write(dos); + dos.close(); + // Now add new values to the MapWritable. + m.put(new Text("key1"), new Text("value1")); + m.put(new Text("key2"), new Text("value2")); + // Now deserialize the original MapWritable. Ensure count and key values + // match original state. 
+ ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + DataInputStream dis = new DataInputStream(bais); + m.readFields(dis); + assertEquals(count, m.size()); + assertTrue(m.get(t).equals(t)); + dis.close(); + } +} diff --git a/src/test/org/apache/hadoop/io/TestSequenceFileSerialization.java b/src/test/org/apache/hadoop/io/TestSequenceFileSerialization.java new file mode 100644 index 00000000000..c9fc1eae4f5 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestSequenceFileSerialization.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile.Reader; +import org.apache.hadoop.io.SequenceFile.Writer; + +public class TestSequenceFileSerialization extends TestCase { + + private Configuration conf; + private FileSystem fs; + + @Override + protected void setUp() throws Exception { + conf = new Configuration(); + conf.set("io.serializations", + "org.apache.hadoop.io.serializer.JavaSerialization"); + fs = FileSystem.getLocal(conf); + } + + @Override + protected void tearDown() throws Exception { + fs.close(); + } + + public void testJavaSerialization() throws Exception { + Path file = new Path(System.getProperty("test.build.data",".") + + "/test.seq"); + + fs.delete(file, true); + Writer writer = SequenceFile.createWriter(fs, conf, file, Long.class, + String.class); + + writer.append(1L, "one"); + writer.append(2L, "two"); + + writer.close(); + + Reader reader = new Reader(fs, file, conf); + assertEquals(1L, reader.next((Object) null)); + assertEquals("one", reader.getCurrentValue((Object) null)); + assertEquals(2L, reader.next((Object) null)); + assertEquals("two", reader.getCurrentValue((Object) null)); + assertNull(reader.next((Object) null)); + reader.close(); + + } +} diff --git a/src/test/org/apache/hadoop/io/TestSetFile.java b/src/test/org/apache/hadoop/io/TestSetFile.java new file mode 100644 index 00000000000..70d02e013f0 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestSetFile.java @@ -0,0 +1,157 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
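TestSequenceFileSerialization replaces the io.serializations list outright, which isolates the Java serializer for the test. The property is a comma-separated list, so outside of tests the Writable and java.io.Serializable serializers can be enabled together; a small illustrative sketch, not part of this patch (the class name is arbitrary):

package org.apache.hadoop.io;

import org.apache.hadoop.conf.Configuration;

public class SerializationsConfigSketch {
  /** Enable Writable and java.io.Serializable records side by side. */
  public static Configuration bothSerializations() {
    Configuration conf = new Configuration();
    // Comma-separated list; order can matter if a class is accepted by
    // more than one serializer.
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.WritableSerialization,"
        + "org.apache.hadoop.io.serializer.JavaSerialization");
    return conf;
  }
}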
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import java.io.*; +import java.util.*; +import junit.framework.TestCase; + +import org.apache.commons.logging.*; + +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.io.SequenceFile.CompressionType; + +/** Support for flat files of binary key/value pairs. */ +public class TestSetFile extends TestCase { + private static final Log LOG = LogFactory.getLog(TestSetFile.class); + private static String FILE = + System.getProperty("test.build.data",".") + "/test.set"; + + private static Configuration conf = new Configuration(); + + public TestSetFile(String name) { super(name); } + + public void testSetFile() throws Exception { + FileSystem fs = FileSystem.getLocal(conf); + try { + RandomDatum[] data = generate(10000); + writeTest(fs, data, FILE, CompressionType.NONE); + readTest(fs, data, FILE); + + writeTest(fs, data, FILE, CompressionType.BLOCK); + readTest(fs, data, FILE); + } finally { + fs.close(); + } + } + + private static RandomDatum[] generate(int count) { + LOG.info("generating " + count + " records in memory"); + RandomDatum[] data = new RandomDatum[count]; + RandomDatum.Generator generator = new RandomDatum.Generator(); + for (int i = 0; i < count; i++) { + generator.next(); + data[i] = generator.getValue(); + } + LOG.info("sorting " + count + " records"); + Arrays.sort(data); + return data; + } + + private static void writeTest(FileSystem fs, RandomDatum[] data, + String file, CompressionType compress) + throws IOException { + MapFile.delete(fs, file); + LOG.info("creating with " + data.length + " records"); + SetFile.Writer writer = + new SetFile.Writer(conf, fs, file, + WritableComparator.get(RandomDatum.class), + compress); + for (int i = 0; i < data.length; i++) + writer.append(data[i]); + writer.close(); + } + + private static void readTest(FileSystem fs, RandomDatum[] data, String file) + throws IOException { + RandomDatum v = new RandomDatum(); + int sample = (int)Math.sqrt(data.length); + Random random = new Random(); + LOG.info("reading " + sample + " records"); + SetFile.Reader reader = new SetFile.Reader(fs, file, conf); + for (int i = 0; i < sample; i++) { + if (!reader.seek(data[random.nextInt(data.length)])) + throw new RuntimeException("wrong value at " + i); + } + reader.close(); + LOG.info("done reading " + data.length); + } + + + /** For debugging and testing. 
*/ + public static void main(String[] args) throws Exception { + int count = 1024 * 1024; + boolean create = true; + boolean check = true; + String file = FILE; + String compress = "NONE"; + + String usage = "Usage: TestSetFile [-count N] [-nocreate] [-nocheck] [-compress type] file"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + int i = 0; + Path fpath=null; + FileSystem fs = null; + try { + for (; i < args.length; i++) { // parse command line + if (args[i] == null) { + continue; + } else if (args[i].equals("-count")) { + count = Integer.parseInt(args[++i]); + } else if (args[i].equals("-nocreate")) { + create = false; + } else if (args[i].equals("-nocheck")) { + check = false; + } else if (args[i].equals("-compress")) { + compress = args[++i]; + } else { + // file is required parameter + file = args[i]; + fpath=new Path(file); + } + } + + fs = fpath.getFileSystem(conf); + + LOG.info("count = " + count); + LOG.info("create = " + create); + LOG.info("check = " + check); + LOG.info("compress = " + compress); + LOG.info("file = " + file); + + RandomDatum[] data = generate(count); + + if (create) { + writeTest(fs, data, file, CompressionType.valueOf(compress)); + } + + if (check) { + readTest(fs, data, file); + } + + } finally { + fs.close(); + } + } +} diff --git a/src/test/org/apache/hadoop/io/TestSortedMapWritable.java b/src/test/org/apache/hadoop/io/TestSortedMapWritable.java new file mode 100644 index 00000000000..927bfc1f42d --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestSortedMapWritable.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.io; + +import java.util.Map; + +import junit.framework.TestCase; + +/** + * Tests SortedMapWritable + */ +public class TestSortedMapWritable extends TestCase { + /** the test */ + @SuppressWarnings("unchecked") + public void testSortedMapWritable() { + Text[] keys = { + new Text("key1"), + new Text("key2"), + new Text("key3"), + }; + + BytesWritable[] values = { + new BytesWritable("value1".getBytes()), + new BytesWritable("value2".getBytes()), + new BytesWritable("value3".getBytes()) + }; + + SortedMapWritable inMap = new SortedMapWritable(); + for (int i = 0; i < keys.length; i++) { + inMap.put(keys[i], values[i]); + } + + assertEquals(0, inMap.firstKey().compareTo(keys[0])); + assertEquals(0, inMap.lastKey().compareTo(keys[2])); + + SortedMapWritable outMap = new SortedMapWritable(inMap); + assertEquals(inMap.size(), outMap.size()); + + for (Map.Entry e: inMap.entrySet()) { + assertTrue(outMap.containsKey(e.getKey())); + assertEquals(0, ((WritableComparable) outMap.get(e.getKey())).compareTo( + e.getValue())); + } + + // Now for something a little harder... 
+ + Text[] maps = { + new Text("map1"), + new Text("map2") + }; + + SortedMapWritable mapOfMaps = new SortedMapWritable(); + mapOfMaps.put(maps[0], inMap); + mapOfMaps.put(maps[1], outMap); + + SortedMapWritable copyOfMapOfMaps = new SortedMapWritable(mapOfMaps); + for (int i = 0; i < maps.length; i++) { + assertTrue(copyOfMapOfMaps.containsKey(maps[i])); + + SortedMapWritable a = (SortedMapWritable) mapOfMaps.get(maps[i]); + SortedMapWritable b = (SortedMapWritable) copyOfMapOfMaps.get(maps[i]); + assertEquals(a.size(), b.size()); + for (Writable key: a.keySet()) { + assertTrue(b.containsKey(key)); + + // This will work because we know what we put into each set + + WritableComparable aValue = (WritableComparable) a.get(key); + WritableComparable bValue = (WritableComparable) b.get(key); + assertEquals(0, aValue.compareTo(bValue)); + } + } + } + + /** + * Test that number of "unknown" classes is propagated across multiple copies. + */ + @SuppressWarnings("deprecation") + public void testForeignClass() { + SortedMapWritable inMap = new SortedMapWritable(); + inMap.put(new Text("key"), new UTF8("value")); + inMap.put(new Text("key2"), new UTF8("value2")); + SortedMapWritable outMap = new SortedMapWritable(inMap); + SortedMapWritable copyOfCopy = new SortedMapWritable(outMap); + assertEquals(1, copyOfCopy.getNewClasses()); + } +} diff --git a/src/test/org/apache/hadoop/io/TestText.java b/src/test/org/apache/hadoop/io/TestText.java new file mode 100644 index 00000000000..6e004860991 --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestText.java @@ -0,0 +1,266 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io; + +import junit.framework.TestCase; + +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.util.Random; + +/** Unit tests for LargeUTF8. */ +public class TestText extends TestCase { + private static final int NUM_ITERATIONS = 100; + public TestText(String name) { super(name); } + + private static final Random RANDOM = new Random(1); + + private static final int RAND_LEN = -1; + + // generate a valid java String + private static String getTestString(int len) throws Exception { + StringBuffer buffer = new StringBuffer(); + int length = (len==RAND_LEN) ? 
RANDOM.nextInt(1000) : len; + while (buffer.length() test = WritableName.getClass("long",conf); + assertTrue(test != null); + } + + public void testSetName() throws Exception { + Configuration conf = new Configuration(); + WritableName.setName(SimpleWritable.class, testName); + + Class test = WritableName.getClass(testName,conf); + assertTrue(test.equals(SimpleWritable.class)); + } + + + public void testAddName() throws Exception { + Configuration conf = new Configuration(); + String altName = testName + ".alt"; + + WritableName.addName(SimpleWritable.class, altName); + + Class test = WritableName.getClass(altName, conf); + assertTrue(test.equals(SimpleWritable.class)); + + // check original name still works + test = WritableName.getClass(testName, conf); + assertTrue(test.equals(SimpleWritable.class)); + + } + + public void testBadName() throws Exception { + Configuration conf = new Configuration(); + try { + Class test = WritableName.getClass("unknown_junk",conf); + assertTrue(false); + } catch(IOException e) { + assertTrue(e.getMessage().matches(".*unknown_junk.*")); + } + } + +} diff --git a/src/test/org/apache/hadoop/io/TestWritableUtils.java b/src/test/org/apache/hadoop/io/TestWritableUtils.java new file mode 100644 index 00000000000..2487fc0612c --- /dev/null +++ b/src/test/org/apache/hadoop/io/TestWritableUtils.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.io; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import junit.framework.TestCase; + +public class TestWritableUtils extends TestCase { + private static final Log LOG = LogFactory.getLog(TestWritableUtils.class); + + public static void testValue(int val, int vintlen) throws IOException { + DataOutputBuffer buf = new DataOutputBuffer(); + DataInputBuffer inbuf = new DataInputBuffer(); + WritableUtils.writeVInt(buf, val); + if (LOG.isDebugEnabled()) { + LOG.debug("Value = " + val); + BytesWritable printer = new BytesWritable(); + printer.set(buf.getData(), 0, buf.getLength()); + LOG.debug("Buffer = " + printer); + } + inbuf.reset(buf.getData(), 0, buf.getLength()); + assertEquals(val, WritableUtils.readVInt(inbuf)); + assertEquals(vintlen, buf.getLength()); + assertEquals(vintlen, WritableUtils.getVIntSize(val)); + assertEquals(vintlen, WritableUtils.decodeVIntSize(buf.getData()[0])); + } + + public static void testVInt() throws Exception { + testValue(12, 1); + testValue(127, 1); + testValue(-112, 1); + testValue(-113, 2); + testValue(-128, 2); + testValue(128, 2); + testValue(-129, 2); + testValue(255, 2); + testValue(-256, 2); + testValue(256, 3); + testValue(-257, 3); + testValue(65535, 3); + testValue(-65536, 3); + testValue(65536, 4); + testValue(-65537, 4); + } +} diff --git a/src/test/org/apache/hadoop/io/compress/TestCodec.java b/src/test/org/apache/hadoop/io/compress/TestCodec.java new file mode 100644 index 00000000000..38e4a358376 --- /dev/null +++ b/src/test/org/apache/hadoop/io/compress/TestCodec.java @@ -0,0 +1,249 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.io.compress; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataInputBuffer; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.RandomDatum; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.compress.CompressionOutputStream; +import org.apache.hadoop.io.compress.zlib.ZlibFactory; + +public class TestCodec extends TestCase { + + private static final Log LOG= + LogFactory.getLog(TestCodec.class); + + private Configuration conf = new Configuration(); + private int count = 10000; + private int seed = new Random().nextInt(); + + public void testDefaultCodec() throws IOException { + codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.DefaultCodec"); + codecTest(conf, seed, count, "org.apache.hadoop.io.compress.DefaultCodec"); + } + + public void testGzipCodec() throws IOException { + codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.GzipCodec"); + codecTest(conf, seed, count, "org.apache.hadoop.io.compress.GzipCodec"); + } + + public void testBZip2Codec() throws IOException { + codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.BZip2Codec"); + codecTest(conf, seed, count, "org.apache.hadoop.io.compress.BZip2Codec"); + } + + private static void codecTest(Configuration conf, int seed, int count, + String codecClass) + throws IOException { + + // Create the codec + CompressionCodec codec = null; + try { + codec = (CompressionCodec) + ReflectionUtils.newInstance(conf.getClassByName(codecClass), conf); + } catch (ClassNotFoundException cnfe) { + throw new IOException("Illegal codec!"); + } + LOG.info("Created a Codec object of type: " + codecClass); + + // Generate data + DataOutputBuffer data = new DataOutputBuffer(); + RandomDatum.Generator generator = new RandomDatum.Generator(seed); + for(int i=0; i < count; ++i) { + generator.next(); + RandomDatum key = generator.getKey(); + RandomDatum value = generator.getValue(); + + key.write(data); + value.write(data); + } + DataInputBuffer originalData = new DataInputBuffer(); + DataInputStream originalIn = new DataInputStream(new BufferedInputStream(originalData)); + originalData.reset(data.getData(), 0, data.getLength()); + + LOG.info("Generated " + count + " records"); + + // Compress data + DataOutputBuffer compressedDataBuffer = new DataOutputBuffer(); + CompressionOutputStream deflateFilter = + codec.createOutputStream(compressedDataBuffer); + DataOutputStream deflateOut = + new DataOutputStream(new BufferedOutputStream(deflateFilter)); + deflateOut.write(data.getData(), 0, data.getLength()); + deflateOut.flush(); + deflateFilter.finish(); + LOG.info("Finished compressing data"); + + // De-compress data + DataInputBuffer deCompressedDataBuffer = new DataInputBuffer(); + deCompressedDataBuffer.reset(compressedDataBuffer.getData(), 0, + compressedDataBuffer.getLength()); + CompressionInputStream inflateFilter = + 
codec.createInputStream(deCompressedDataBuffer); + DataInputStream inflateIn = + new DataInputStream(new BufferedInputStream(inflateFilter)); + + // Check + for(int i=0; i < count; ++i) { + RandomDatum k1 = new RandomDatum(); + RandomDatum v1 = new RandomDatum(); + k1.readFields(originalIn); + v1.readFields(originalIn); + + RandomDatum k2 = new RandomDatum(); + RandomDatum v2 = new RandomDatum(); + k2.readFields(inflateIn); + v2.readFields(inflateIn); + } + LOG.info("SUCCESS! Completed checking " + count + " records"); + } + + public void testCodecPoolGzipReuse() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean("hadoop.native.lib", true); + if (!ZlibFactory.isNativeZlibLoaded(conf)) { + LOG.warn("testCodecPoolGzipReuse skipped: native libs not loaded"); + return; + } + GzipCodec gzc = ReflectionUtils.newInstance(GzipCodec.class, conf); + DefaultCodec dfc = ReflectionUtils.newInstance(DefaultCodec.class, conf); + Compressor c1 = CodecPool.getCompressor(gzc); + Compressor c2 = CodecPool.getCompressor(dfc); + CodecPool.returnCompressor(c1); + CodecPool.returnCompressor(c2); + assertTrue("Got mismatched ZlibCompressor", c2 != CodecPool.getCompressor(gzc)); + } + + public void testSequenceFileDefaultCodec() throws IOException, ClassNotFoundException, + InstantiationException, IllegalAccessException { + sequenceFileCodecTest(conf, 100, "org.apache.hadoop.io.compress.DefaultCodec", 100); + sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.DefaultCodec", 1000000); + } + + public void testSequenceFileBZip2Codec() throws IOException, ClassNotFoundException, + InstantiationException, IllegalAccessException { + sequenceFileCodecTest(conf, 0, "org.apache.hadoop.io.compress.BZip2Codec", 100); + sequenceFileCodecTest(conf, 100, "org.apache.hadoop.io.compress.BZip2Codec", 100); + sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.BZip2Codec", 1000000); + } + + private static void sequenceFileCodecTest(Configuration conf, int lines, + String codecClass, int blockSize) + throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException { + + Path filePath = new Path("SequenceFileCodecTest." 
+ codecClass); + // Configuration + conf.setInt("io.seqfile.compress.blocksize", blockSize); + + // Create the SequenceFile + FileSystem fs = FileSystem.get(conf); + LOG.info("Creating SequenceFile with codec \"" + codecClass + "\""); + SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, filePath, + Text.class, Text.class, CompressionType.BLOCK, + (CompressionCodec)Class.forName(codecClass).newInstance()); + + // Write some data + LOG.info("Writing to SequenceFile..."); + for (int i=0; i getCompressorType() { + return null; + } + + public Compressor createCompressor() { + return null; + } + + public CompressionInputStream createInputStream(InputStream in, + Decompressor decompressor) + throws IOException { + return null; + } + + public CompressionInputStream createInputStream(InputStream in) + throws IOException { + return null; + } + + public CompressionOutputStream createOutputStream(OutputStream out, + Compressor compressor) + throws IOException { + return null; + } + + public Class getDecompressorType() { + return null; + } + + public Decompressor createDecompressor() { + return null; + } + + public String getDefaultExtension() { + return ".base"; + } + } + + private static class BarCodec extends BaseCodec { + public String getDefaultExtension() { + return "bar"; + } + } + + private static class FooBarCodec extends BaseCodec { + public String getDefaultExtension() { + return ".foo.bar"; + } + } + + private static class FooCodec extends BaseCodec { + public String getDefaultExtension() { + return ".foo"; + } + } + + /** + * Returns a factory for a given set of codecs + * @param classes the codec classes to include + * @return a new factory + */ + private static CompressionCodecFactory setClasses(Class[] classes) { + Configuration conf = new Configuration(); + CompressionCodecFactory.setCodecClasses(conf, Arrays.asList(classes)); + return new CompressionCodecFactory(conf); + } + + private static void checkCodec(String msg, + Class expected, CompressionCodec actual) { + assertEquals(msg + " unexpected codec found", + expected.getName(), + actual.getClass().getName()); + } + + public static void testFinding() { + CompressionCodecFactory factory = + new CompressionCodecFactory(new Configuration()); + CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar")); + assertEquals("default factory foo codec", null, codec); + codec = factory.getCodec(new Path("/tmp/foo.gz")); + checkCodec("default factory for .gz", GzipCodec.class, codec); + codec = factory.getCodec(new Path("/tmp/foo.bz2")); + checkCodec("default factory for .bz2", BZip2Codec.class, codec); + factory = setClasses(new Class[0]); + codec = factory.getCodec(new Path("/tmp/foo.bar")); + assertEquals("empty codec bar codec", null, codec); + codec = factory.getCodec(new Path("/tmp/foo.gz")); + assertEquals("empty codec gz codec", null, codec); + codec = factory.getCodec(new Path("/tmp/foo.bz2")); + assertEquals("default factory for .bz2", null, codec); + factory = setClasses(new Class[]{BarCodec.class, FooCodec.class, + FooBarCodec.class}); + codec = factory.getCodec(new Path("/tmp/.foo.bar.gz")); + assertEquals("full factory gz codec", null, codec); + codec = factory.getCodec(new Path("/tmp/foo.bz2")); + assertEquals("default factory for .bz2", null, codec); + codec = factory.getCodec(new Path("/tmp/foo.bar")); + checkCodec("full factory bar codec", BarCodec.class, codec); + codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar")); + checkCodec("full factory foo bar codec", FooBarCodec.class, codec); + codec = 
factory.getCodec(new Path("/tmp/foo.foo")); + checkCodec("full factory foo codec", FooCodec.class, codec); + } +} diff --git a/src/test/org/apache/hadoop/io/retry/TestRetryProxy.java b/src/test/org/apache/hadoop/io/retry/TestRetryProxy.java new file mode 100644 index 00000000000..c48e87b7dd9 --- /dev/null +++ b/src/test/org/apache/hadoop/io/retry/TestRetryProxy.java @@ -0,0 +1,170 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.retry; + +import static org.apache.hadoop.io.retry.RetryPolicies.RETRY_FOREVER; +import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_DONT_FAIL; +import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_THEN_FAIL; +import static org.apache.hadoop.io.retry.RetryPolicies.retryByException; +import static org.apache.hadoop.io.retry.RetryPolicies.retryByRemoteException; +import static org.apache.hadoop.io.retry.RetryPolicies.retryUpToMaximumCountWithFixedSleep; +import static org.apache.hadoop.io.retry.RetryPolicies.retryUpToMaximumCountWithProportionalSleep; +import static org.apache.hadoop.io.retry.RetryPolicies.retryUpToMaximumTimeWithFixedSleep; +import static org.apache.hadoop.io.retry.RetryPolicies.exponentialBackoffRetry; + +import java.util.Collections; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import junit.framework.TestCase; + +import org.apache.hadoop.io.retry.UnreliableInterface.FatalException; +import org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException; +import org.apache.hadoop.ipc.RemoteException; + +public class TestRetryProxy extends TestCase { + + private UnreliableImplementation unreliableImpl; + + @Override + protected void setUp() throws Exception { + unreliableImpl = new UnreliableImplementation(); + } + + public void testTryOnceThenFail() throws UnreliableException { + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, TRY_ONCE_THEN_FAIL); + unreliable.alwaysSucceeds(); + try { + unreliable.failsOnceThenSucceeds(); + fail("Should fail"); + } catch (UnreliableException e) { + // expected + } + } + + public void testTryOnceDontFail() throws UnreliableException { + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, TRY_ONCE_DONT_FAIL); + unreliable.alwaysSucceeds(); + unreliable.failsOnceThenSucceeds(); + try { + unreliable.failsOnceThenSucceedsWithReturnValue(); + fail("Should fail"); + } catch (UnreliableException e) { + // expected + } + } + + public void testRetryForever() throws UnreliableException { + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, RETRY_FOREVER); + unreliable.alwaysSucceeds(); + 
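+    // RETRY_FOREVER retries the call whenever it throws, so even
+    // failsTenTimesThenSucceeds() is expected to return normally here.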
unreliable.failsOnceThenSucceeds(); + unreliable.failsTenTimesThenSucceeds(); + } + + public void testRetryUpToMaximumCountWithFixedSleep() throws UnreliableException { + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, + retryUpToMaximumCountWithFixedSleep(8, 1, TimeUnit.NANOSECONDS)); + unreliable.alwaysSucceeds(); + unreliable.failsOnceThenSucceeds(); + try { + unreliable.failsTenTimesThenSucceeds(); + fail("Should fail"); + } catch (UnreliableException e) { + // expected + } + } + + public void testRetryUpToMaximumTimeWithFixedSleep() throws UnreliableException { + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, + retryUpToMaximumTimeWithFixedSleep(80, 10, TimeUnit.NANOSECONDS)); + unreliable.alwaysSucceeds(); + unreliable.failsOnceThenSucceeds(); + try { + unreliable.failsTenTimesThenSucceeds(); + fail("Should fail"); + } catch (UnreliableException e) { + // expected + } + } + + public void testRetryUpToMaximumCountWithProportionalSleep() throws UnreliableException { + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, + retryUpToMaximumCountWithProportionalSleep(8, 1, TimeUnit.NANOSECONDS)); + unreliable.alwaysSucceeds(); + unreliable.failsOnceThenSucceeds(); + try { + unreliable.failsTenTimesThenSucceeds(); + fail("Should fail"); + } catch (UnreliableException e) { + // expected + } + } + + public void testExponentialRetry() throws UnreliableException { + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, + exponentialBackoffRetry(5, 1L, TimeUnit.NANOSECONDS)); + unreliable.alwaysSucceeds(); + unreliable.failsOnceThenSucceeds(); + try { + unreliable.failsTenTimesThenSucceeds(); + fail("Should fail"); + } catch (UnreliableException e) { + // expected + } + } + + public void testRetryByException() throws UnreliableException { + Map, RetryPolicy> exceptionToPolicyMap = + Collections., RetryPolicy>singletonMap(FatalException.class, TRY_ONCE_THEN_FAIL); + + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, + retryByException(RETRY_FOREVER, exceptionToPolicyMap)); + unreliable.failsOnceThenSucceeds(); + try { + unreliable.alwaysFailsWithFatalException(); + fail("Should fail"); + } catch (FatalException e) { + // expected + } + } + + public void testRetryByRemoteException() throws UnreliableException { + Map, RetryPolicy> exceptionToPolicyMap = + Collections., RetryPolicy>singletonMap(FatalException.class, TRY_ONCE_THEN_FAIL); + + UnreliableInterface unreliable = (UnreliableInterface) + RetryProxy.create(UnreliableInterface.class, unreliableImpl, + retryByRemoteException(RETRY_FOREVER, exceptionToPolicyMap)); + try { + unreliable.alwaysFailsWithRemoteFatalException(); + fail("Should fail"); + } catch (RemoteException e) { + // expected + } + } + +} diff --git a/src/test/org/apache/hadoop/io/retry/UnreliableImplementation.java b/src/test/org/apache/hadoop/io/retry/UnreliableImplementation.java new file mode 100644 index 00000000000..5971ee72165 --- /dev/null +++ b/src/test/org/apache/hadoop/io/retry/UnreliableImplementation.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.retry; + +import org.apache.hadoop.ipc.RemoteException; + +public class UnreliableImplementation implements UnreliableInterface { + + private int failsOnceInvocationCount, + failsOnceWithValueInvocationCount, + failsTenTimesInvocationCount; + + public void alwaysSucceeds() { + // do nothing + } + + public void alwaysFailsWithFatalException() throws FatalException { + throw new FatalException(); + } + + public void alwaysFailsWithRemoteFatalException() throws RemoteException { + throw new RemoteException(FatalException.class.getName(), "Oops"); + } + + public void failsOnceThenSucceeds() throws UnreliableException { + if (failsOnceInvocationCount++ == 0) { + throw new UnreliableException(); + } + } + + public boolean failsOnceThenSucceedsWithReturnValue() throws UnreliableException { + if (failsOnceWithValueInvocationCount++ == 0) { + throw new UnreliableException(); + } + return true; + } + + public void failsTenTimesThenSucceeds() throws UnreliableException { + if (failsTenTimesInvocationCount++ < 10) { + throw new UnreliableException(); + } + } + +} diff --git a/src/test/org/apache/hadoop/io/retry/UnreliableInterface.java b/src/test/org/apache/hadoop/io/retry/UnreliableInterface.java new file mode 100644 index 00000000000..af4959151e7 --- /dev/null +++ b/src/test/org/apache/hadoop/io/retry/UnreliableInterface.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.io.retry; + +import org.apache.hadoop.ipc.RemoteException; + +public interface UnreliableInterface { + + public static class UnreliableException extends Exception { + // no body + } + + public static class FatalException extends UnreliableException { + // no body + } + + void alwaysSucceeds() throws UnreliableException; + + void alwaysFailsWithFatalException() throws FatalException; + void alwaysFailsWithRemoteFatalException() throws RemoteException; + + void failsOnceThenSucceeds() throws UnreliableException; + boolean failsOnceThenSucceedsWithReturnValue() throws UnreliableException; + + void failsTenTimesThenSucceeds() throws UnreliableException; +} diff --git a/src/test/org/apache/hadoop/io/serializer/TestWritableSerialization.java b/src/test/org/apache/hadoop/io/serializer/TestWritableSerialization.java new file mode 100644 index 00000000000..6a551753245 --- /dev/null +++ b/src/test/org/apache/hadoop/io/serializer/TestWritableSerialization.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.serializer; + +import static org.apache.hadoop.io.TestGenericWritable.CONF_TEST_KEY; +import static org.apache.hadoop.io.TestGenericWritable.CONF_TEST_VALUE; +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.DataInputBuffer; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.TestGenericWritable.Baz; +import org.apache.hadoop.io.TestGenericWritable.FooGenericWritable; +import org.apache.hadoop.util.GenericsUtil; + +public class TestWritableSerialization extends TestCase { + + private static final Configuration conf = new Configuration(); + + static { + conf.set("io.serializations" + , "org.apache.hadoop.io.serializer.WritableSerialization"); + } + + public void testWritableSerialization() throws Exception { + Text before = new Text("test writable"); + testSerialization(conf, before); + } + + + public void testWritableConfigurable() throws Exception { + + //set the configuration parameter + conf.set(CONF_TEST_KEY, CONF_TEST_VALUE); + + //reuse TestGenericWritable inner classes to test + //writables that also implement Configurable. + FooGenericWritable generic = new FooGenericWritable(); + generic.setConf(conf); + Baz baz = new Baz(); + generic.set(baz); + Baz result = testSerialization(conf, baz); + assertNotNull(result.getConf()); + } + + /** + * A utility that tests serialization/deserialization. 
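+   * Round-trips the item through the serialization registered for its class
+   * and asserts that the deserialized copy equals the original.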
+ * @param <K> the class of the item + * @param conf configuration to use, "io.serializations" is read to + * determine the serialization + * @param before item to (de)serialize + * @return deserialized item + */ + public static <K> K testSerialization(Configuration conf, K before) + throws Exception { + + SerializationFactory factory = new SerializationFactory(conf); + Serializer<K> serializer + = factory.getSerializer(GenericsUtil.getClass(before)); + Deserializer<K> deserializer + = factory.getDeserializer(GenericsUtil.getClass(before)); + + DataOutputBuffer out = new DataOutputBuffer(); + serializer.open(out); + serializer.serialize(before); + serializer.close(); + + DataInputBuffer in = new DataInputBuffer(); + in.reset(out.getData(), out.getLength()); + deserializer.open(in); + K after = deserializer.deserialize(null); + deserializer.close(); + + assertEquals(before, after); + return after; + } + +} diff --git a/src/test/org/apache/hadoop/ipc/TestIPC.java b/src/test/org/apache/hadoop/ipc/TestIPC.java new file mode 100644 index 00000000000..df5a1558153 --- /dev/null +++ b/src/test/org/apache/hadoop/ipc/TestIPC.java @@ -0,0 +1,243 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ipc; + +import org.apache.commons.logging.*; + +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.net.NetUtils; + +import java.util.Random; +import java.io.IOException; +import java.net.InetSocketAddress; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; + +/** Unit tests for IPC.
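+ * Starts in-process Server instances and exercises them with serial and
+ * parallel client calls.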
*/ +public class TestIPC extends TestCase { + public static final Log LOG = + LogFactory.getLog(TestIPC.class); + + final private static Configuration conf = new Configuration(); + final static private int PING_INTERVAL = 1000; + + static { + Client.setPingInterval(conf, PING_INTERVAL); + } + public TestIPC(String name) { super(name); } + + private static final Random RANDOM = new Random(); + + private static final String ADDRESS = "0.0.0.0"; + + private static class TestServer extends Server { + private boolean sleep; + + public TestServer(int handlerCount, boolean sleep) + throws IOException { + super(ADDRESS, 0, LongWritable.class, handlerCount, conf); + this.sleep = sleep; + } + + @Override + public Writable call(Class protocol, Writable param, long receiveTime) + throws IOException { + if (sleep) { + try { + Thread.sleep(RANDOM.nextInt(2*PING_INTERVAL)); // sleep a bit + } catch (InterruptedException e) {} + } + return param; // echo param as result + } + } + + private static class SerialCaller extends Thread { + private Client client; + private InetSocketAddress server; + private int count; + private boolean failed; + + public SerialCaller(Client client, InetSocketAddress server, int count) { + this.client = client; + this.server = server; + this.count = count; + } + + public void run() { + for (int i = 0; i < count; i++) { + try { + LongWritable param = new LongWritable(RANDOM.nextLong()); + LongWritable value = + (LongWritable)client.call(param, server); + if (!param.equals(value)) { + LOG.fatal("Call failed!"); + failed = true; + break; + } + } catch (Exception e) { + LOG.fatal("Caught: " + StringUtils.stringifyException(e)); + failed = true; + } + } + } + } + + private static class ParallelCaller extends Thread { + private Client client; + private int count; + private InetSocketAddress[] addresses; + private boolean failed; + + public ParallelCaller(Client client, InetSocketAddress[] addresses, + int count) { + this.client = client; + this.addresses = addresses; + this.count = count; + } + + public void run() { + for (int i = 0; i < count; i++) { + try { + Writable[] params = new Writable[addresses.length]; + for (int j = 0; j < addresses.length; j++) + params[j] = new LongWritable(RANDOM.nextLong()); + Writable[] values = client.call(params, addresses); + for (int j = 0; j < addresses.length; j++) { + if (!params[j].equals(values[j])) { + LOG.fatal("Call failed!"); + failed = true; + break; + } + } + } catch (Exception e) { + LOG.fatal("Caught: " + StringUtils.stringifyException(e)); + failed = true; + } + } + } + } + + public void testSerial() throws Exception { + testSerial(3, false, 2, 5, 100); + } + + public void testSerial(int handlerCount, boolean handlerSleep, + int clientCount, int callerCount, int callCount) + throws Exception { + Server server = new TestServer(handlerCount, handlerSleep); + InetSocketAddress addr = NetUtils.getConnectAddress(server); + server.start(); + + Client[] clients = new Client[clientCount]; + for (int i = 0; i < clientCount; i++) { + clients[i] = new Client(LongWritable.class, conf); + } + + SerialCaller[] callers = new SerialCaller[callerCount]; + for (int i = 0; i < callerCount; i++) { + callers[i] = new SerialCaller(clients[i%clientCount], addr, callCount); + callers[i].start(); + } + for (int i = 0; i < callerCount; i++) { + callers[i].join(); + assertFalse(callers[i].failed); + } + for (int i = 0; i < clientCount; i++) { + clients[i].stop(); + } + server.stop(); + } + + public void testParallel() throws Exception { + testParallel(10, 
false, 2, 4, 2, 4, 100); + } + + public void testParallel(int handlerCount, boolean handlerSleep, + int serverCount, int addressCount, + int clientCount, int callerCount, int callCount) + throws Exception { + Server[] servers = new Server[serverCount]; + for (int i = 0; i < serverCount; i++) { + servers[i] = new TestServer(handlerCount, handlerSleep); + servers[i].start(); + } + + InetSocketAddress[] addresses = new InetSocketAddress[addressCount]; + for (int i = 0; i < addressCount; i++) { + addresses[i] = NetUtils.getConnectAddress(servers[i%serverCount]); + } + + Client[] clients = new Client[clientCount]; + for (int i = 0; i < clientCount; i++) { + clients[i] = new Client(LongWritable.class, conf); + } + + ParallelCaller[] callers = new ParallelCaller[callerCount]; + for (int i = 0; i < callerCount; i++) { + callers[i] = + new ParallelCaller(clients[i%clientCount], addresses, callCount); + callers[i].start(); + } + for (int i = 0; i < callerCount; i++) { + callers[i].join(); + assertFalse(callers[i].failed); + } + for (int i = 0; i < clientCount; i++) { + clients[i].stop(); + } + for (int i = 0; i < serverCount; i++) { + servers[i].stop(); + } + } + + public void testStandAloneClient() throws Exception { + testParallel(10, false, 2, 4, 2, 4, 100); + Client client = new Client(LongWritable.class, conf); + InetSocketAddress address = new InetSocketAddress("127.0.0.1", 10); + try { + client.call(new LongWritable(RANDOM.nextLong()), + address); + fail("Expected an exception to have been thrown"); + } catch (IOException e) { + String message = e.getMessage(); + String addressText = address.toString(); + assertTrue("Did not find "+addressText+" in "+message, + message.contains(addressText)); + Throwable cause=e.getCause(); + assertNotNull("No nested exception in "+e,cause); + String causeText=cause.getMessage(); + assertTrue("Did not find " + causeText + " in " + message, + message.contains(causeText)); + } + } + + + public static void main(String[] args) throws Exception { + + //new TestIPC("test").testSerial(5, false, 2, 10, 1000); + + new TestIPC("test").testParallel(10, false, 2, 4, 2, 4, 1000); + + } + +} diff --git a/src/test/org/apache/hadoop/ipc/TestIPCServerResponder.java b/src/test/org/apache/hadoop/ipc/TestIPCServerResponder.java new file mode 100644 index 00000000000..2591da01432 --- /dev/null +++ b/src/test/org/apache/hadoop/ipc/TestIPCServerResponder.java @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.ipc; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.net.NetUtils; + +/** + * This test provokes partial writes in the server, which is + * serving multiple clients. + */ +public class TestIPCServerResponder extends TestCase { + + public static final Log LOG = + LogFactory.getLog(TestIPCServerResponder.class); + + private static Configuration conf = new Configuration(); + + public TestIPCServerResponder(final String name) { + super(name); + } + + private static final Random RANDOM = new Random(); + + private static final String ADDRESS = "0.0.0.0"; + + private static final int BYTE_COUNT = 1024; + private static final byte[] BYTES = new byte[BYTE_COUNT]; + static { + for (int i = 0; i < BYTE_COUNT; i++) + BYTES[i] = (byte) ('a' + (i % 26)); + } + + private static class TestServer extends Server { + + private boolean sleep; + + public TestServer(final int handlerCount, final boolean sleep) + throws IOException { + super(ADDRESS, 0, BytesWritable.class, handlerCount, conf); + // Set the buffer size to half of the maximum parameter/result size + // to force the socket to block + this.setSocketSendBufSize(BYTE_COUNT / 2); + this.sleep = sleep; + } + + @Override + public Writable call(Class protocol, Writable param, long receiveTime) + throws IOException { + if (sleep) { + try { + Thread.sleep(RANDOM.nextInt(20)); // sleep a bit + } catch (InterruptedException e) {} + } + return param; + } + } + + private static class Caller extends Thread { + + private Client client; + private int count; + private InetSocketAddress address; + private boolean failed; + + public Caller(final Client client, final InetSocketAddress address, + final int count) { + this.client = client; + this.address = address; + this.count = count; + } + + @Override + public void run() { + for (int i = 0; i < count; i++) { + try { + int byteSize = RANDOM.nextInt(BYTE_COUNT); + byte[] bytes = new byte[byteSize]; + System.arraycopy(BYTES, 0, bytes, 0, byteSize); + Writable param = new BytesWritable(bytes); + Writable value = client.call(param, address); + Thread.sleep(RANDOM.nextInt(20)); + } catch (Exception e) { + LOG.fatal("Caught: " + e); + failed = true; + } + } + } + } + + public void testServerResponder() throws Exception { + testServerResponder(10, true, 1, 10, 200); + } + + public void testServerResponder(final int handlerCount, + final boolean handlerSleep, + final int clientCount, + final int callerCount, + final int callCount) throws Exception { + Server server = new TestServer(handlerCount, handlerSleep); + server.start(); + + InetSocketAddress address = NetUtils.getConnectAddress(server); + Client[] clients = new Client[clientCount]; + for (int i = 0; i < clientCount; i++) { + clients[i] = new Client(BytesWritable.class, conf); + } + + Caller[] callers = new Caller[callerCount]; + for (int i = 0; i < callerCount; i++) { + callers[i] = new Caller(clients[i % clientCount], address, callCount); + callers[i].start(); + } + for (int i = 0; i < callerCount; i++) { + callers[i].join(); + assertFalse(callers[i].failed); + } + for (int i = 0; i < clientCount; i++) { + clients[i].stop(); + } + server.stop(); + } + +} diff --git a/src/test/org/apache/hadoop/ipc/TestRPC.java 
b/src/test/org/apache/hadoop/ipc/TestRPC.java new file mode 100644 index 00000000000..d0db263cc1a --- /dev/null +++ b/src/test/org/apache/hadoop/ipc/TestRPC.java @@ -0,0 +1,391 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ipc; + +import java.io.IOException; +import java.net.ConnectException; +import java.net.InetSocketAddress; +import java.lang.reflect.Method; + +import junit.framework.TestCase; + +import java.util.Arrays; + +import org.apache.commons.logging.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.UTF8; +import org.apache.hadoop.io.Writable; + +import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.SecurityUtil; +import org.apache.hadoop.security.authorize.AuthorizationException; +import org.apache.hadoop.security.authorize.ConfiguredPolicy; +import org.apache.hadoop.security.authorize.PolicyProvider; +import org.apache.hadoop.security.authorize.Service; +import org.apache.hadoop.security.authorize.ServiceAuthorizationManager; + +/** Unit tests for RPC. 
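+ * Covers echo and add calls, slow vs. fast pings, multi-threaded data
+ * transfers, multi-calls, and service-level authorization.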
*/ +public class TestRPC extends TestCase { + private static final String ADDRESS = "0.0.0.0"; + + public static final Log LOG = + LogFactory.getLog(TestRPC.class); + + private static Configuration conf = new Configuration(); + + int datasize = 1024*100; + int numThreads = 50; + + public TestRPC(String name) { super(name); } + + public interface TestProtocol extends VersionedProtocol { + public static final long versionID = 1L; + + void ping() throws IOException; + void slowPing(boolean shouldSlow) throws IOException; + String echo(String value) throws IOException; + String[] echo(String[] value) throws IOException; + Writable echo(Writable value) throws IOException; + int add(int v1, int v2) throws IOException; + int add(int[] values) throws IOException; + int error() throws IOException; + void testServerGet() throws IOException; + int[] exchange(int[] values) throws IOException; + } + + public class TestImpl implements TestProtocol { + int fastPingCounter = 0; + + public long getProtocolVersion(String protocol, long clientVersion) { + return TestProtocol.versionID; + } + + public void ping() {} + + public synchronized void slowPing(boolean shouldSlow) { + if (shouldSlow) { + while (fastPingCounter < 2) { + try { + wait(); // slow response until two fast pings happened + } catch (InterruptedException ignored) {} + } + fastPingCounter -= 2; + } else { + fastPingCounter++; + notify(); + } + } + + public String echo(String value) throws IOException { return value; } + + public String[] echo(String[] values) throws IOException { return values; } + + public Writable echo(Writable writable) { + return writable; + } + public int add(int v1, int v2) { + return v1 + v2; + } + + public int add(int[] values) { + int sum = 0; + for (int i = 0; i < values.length; i++) { + sum += values[i]; + } + return sum; + } + + public int error() throws IOException { + throw new IOException("bobo"); + } + + public void testServerGet() throws IOException { + if (!(Server.get() instanceof RPC.Server)) { + throw new IOException("Server.get() failed"); + } + } + + public int[] exchange(int[] values) { + for (int i = 0; i < values.length; i++) { + values[i] = i; + } + return values; + } + } + + // + // an object that does a bunch of transactions + // + static class Transactions implements Runnable { + int datasize; + TestProtocol proxy; + + Transactions(TestProtocol proxy, int datasize) { + this.proxy = proxy; + this.datasize = datasize; + } + + // do two RPC that transfers data. + public void run() { + int[] indata = new int[datasize]; + int[] outdata = null; + int val = 0; + try { + outdata = proxy.exchange(indata); + val = proxy.add(1,2); + } catch (IOException e) { + assertTrue("Exception from RPC exchange() " + e, false); + } + assertEquals(indata.length, outdata.length); + assertEquals(val, 3); + for (int i = 0; i < outdata.length; i++) { + assertEquals(outdata[i], i); + } + } + } + + // + // A class that does an RPC but does not read its response. 
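+  // The call blocks inside TestImpl.slowPing(true) until two fast pings have
+  // been made, so isDone() lets the test observe that the RPC is still pending.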
+ // + static class SlowRPC implements Runnable { + private TestProtocol proxy; + private volatile boolean done; + + SlowRPC(TestProtocol proxy) { + this.proxy = proxy; + done = false; + } + + boolean isDone() { + return done; + } + + public void run() { + try { + proxy.slowPing(true); // this would hang until two fast pings happened + done = true; + } catch (IOException e) { + assertTrue("SlowRPC ping exception " + e, false); + } + } + } + + public void testSlowRpc() throws Exception { + System.out.println("Testing Slow RPC"); + // create a server with two handlers + Server server = RPC.getServer(new TestImpl(), ADDRESS, 0, 2, false, conf); + TestProtocol proxy = null; + + try { + server.start(); + + InetSocketAddress addr = NetUtils.getConnectAddress(server); + + // create a client + proxy = (TestProtocol)RPC.getProxy( + TestProtocol.class, TestProtocol.versionID, addr, conf); + + SlowRPC slowrpc = new SlowRPC(proxy); + Thread thread = new Thread(slowrpc, "SlowRPC"); + thread.start(); // send a slow RPC, which won't return until two fast pings + assertTrue("Slow RPC should not have finished1.", !slowrpc.isDone()); + + proxy.slowPing(false); // first fast ping + + // verify that the first RPC is still stuck + assertTrue("Slow RPC should not have finished2.", !slowrpc.isDone()); + + proxy.slowPing(false); // second fast ping + + // Now the slow ping should be able to be executed + while (!slowrpc.isDone()) { + System.out.println("Waiting for slow RPC to get done."); + try { + Thread.sleep(1000); + } catch (InterruptedException e) {} + } + } finally { + server.stop(); + if (proxy != null) { + RPC.stopProxy(proxy); + } + System.out.println("Down slow rpc testing"); + } + } + + + public void testCalls() throws Exception { + Server server = RPC.getServer(new TestImpl(), ADDRESS, 0, conf); + TestProtocol proxy = null; + try { + server.start(); + + InetSocketAddress addr = NetUtils.getConnectAddress(server); + proxy = (TestProtocol)RPC.getProxy( + TestProtocol.class, TestProtocol.versionID, addr, conf); + + proxy.ping(); + + String stringResult = proxy.echo("foo"); + assertEquals(stringResult, "foo"); + + stringResult = proxy.echo((String)null); + assertEquals(stringResult, null); + + String[] stringResults = proxy.echo(new String[]{"foo","bar"}); + assertTrue(Arrays.equals(stringResults, new String[]{"foo","bar"})); + + stringResults = proxy.echo((String[])null); + assertTrue(Arrays.equals(stringResults, null)); + + UTF8 utf8Result = (UTF8)proxy.echo(new UTF8("hello world")); + assertEquals(utf8Result, new UTF8("hello world")); + + utf8Result = (UTF8)proxy.echo((UTF8)null); + assertEquals(utf8Result, null); + + int intResult = proxy.add(1, 2); + assertEquals(intResult, 3); + + intResult = proxy.add(new int[] {1, 2}); + assertEquals(intResult, 3); + + boolean caught = false; + try { + proxy.error(); + } catch (IOException e) { + LOG.debug("Caught " + e); + caught = true; + } + assertTrue(caught); + + proxy.testServerGet(); + + // create multiple threads and make them do large data transfers + System.out.println("Starting multi-threaded RPC test..."); + server.setSocketSendBufSize(1024); + Thread threadId[] = new Thread[numThreads]; + for (int i = 0; i < numThreads; i++) { + Transactions trans = new Transactions(proxy, datasize); + threadId[i] = new Thread(trans, "TransactionThread-" + i); + threadId[i].start(); + } + + // wait for all transactions to get over + System.out.println("Waiting for all threads to finish RPCs..."); + for (int i = 0; i < numThreads; i++) { + try { + threadId[i].join(); 
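+          // if join() is interrupted, the catch below decrements i so the
+          // same thread is joined again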
+ } catch (InterruptedException e) { + i--; // retry + } + } + + // try some multi-calls + Method echo = + TestProtocol.class.getMethod("echo", new Class[] { String.class }); + String[] strings = (String[])RPC.call(echo, new String[][]{{"a"},{"b"}}, + new InetSocketAddress[] {addr, addr}, conf); + assertTrue(Arrays.equals(strings, new String[]{"a","b"})); + + Method ping = TestProtocol.class.getMethod("ping", new Class[] {}); + Object[] voids = (Object[])RPC.call(ping, new Object[][]{{},{}}, + new InetSocketAddress[] {addr, addr}, conf); + assertEquals(voids, null); + } finally { + server.stop(); + if(proxy!=null) RPC.stopProxy(proxy); + } + } + + public void testStandaloneClient() throws IOException { + try { + RPC.waitForProxy(TestProtocol.class, + TestProtocol.versionID, new InetSocketAddress(ADDRESS, 20), conf, 15000L); + fail("We should not have reached here"); + } catch (ConnectException ioe) { + //this is what we expected + } + } + + private static final String ACL_CONFIG = "test.protocol.acl"; + + private static class TestPolicyProvider extends PolicyProvider { + + @Override + public Service[] getServices() { + return new Service[] { new Service(ACL_CONFIG, TestProtocol.class) }; + } + + } + + private void doRPCs(Configuration conf, boolean expectFailure) throws Exception { + SecurityUtil.setPolicy(new ConfiguredPolicy(conf, new TestPolicyProvider())); + + Server server = RPC.getServer(new TestImpl(), ADDRESS, 0, 5, true, conf); + + TestProtocol proxy = null; + + server.start(); + + InetSocketAddress addr = NetUtils.getConnectAddress(server); + + try { + proxy = (TestProtocol)RPC.getProxy( + TestProtocol.class, TestProtocol.versionID, addr, conf); + proxy.ping(); + + if (expectFailure) { + fail("Expect RPC.getProxy to fail with AuthorizationException!"); + } + } catch (RemoteException e) { + if (expectFailure) { + assertTrue(e.unwrapRemoteException() instanceof AuthorizationException); + } else { + throw e; + } + } finally { + server.stop(); + if (proxy != null) { + RPC.stopProxy(proxy); + } + } + } + + public void testAuthorization() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean( + ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, true); + + // Expect to succeed + conf.set(ACL_CONFIG, "*"); + doRPCs(conf, false); + + // Reset authorization to expect failure + conf.set(ACL_CONFIG, "invalid invalid"); + doRPCs(conf, true); + } + + public static void main(String[] args) throws Exception { + + new TestRPC("test").testCalls(); + + } +} diff --git a/src/test/org/apache/hadoop/log/TestLogLevel.java b/src/test/org/apache/hadoop/log/TestLogLevel.java new file mode 100644 index 00000000000..f2443c04d90 --- /dev/null +++ b/src/test/org/apache/hadoop/log/TestLogLevel.java @@ -0,0 +1,78 @@ +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*/ +package org.apache.hadoop.log; + +import java.io.*; +import java.net.*; + +import org.apache.hadoop.http.HttpServer; + +import junit.framework.TestCase; +import org.apache.commons.logging.*; +import org.apache.commons.logging.impl.*; +import org.apache.log4j.*; + +public class TestLogLevel extends TestCase { + static final PrintStream out = System.out; + + public void testDynamicLogLevel() throws Exception { + String logName = TestLogLevel.class.getName(); + Log testlog = LogFactory.getLog(logName); + + //only test Log4JLogger + if (testlog instanceof Log4JLogger) { + Logger log = ((Log4JLogger)testlog).getLogger(); + log.debug("log.debug1"); + log.info("log.info1"); + log.error("log.error1"); + assertTrue(!Level.ERROR.equals(log.getEffectiveLevel())); + + HttpServer server = new HttpServer("..", "localhost", 22222, true); + server.start(); + int port = server.getPort(); + + //servlet + URL url = new URL("http://localhost:" + port + + "/logLevel?log=" + logName + "&level=" + Level.ERROR); + out.println("*** Connecting to " + url); + URLConnection connection = url.openConnection(); + connection.connect(); + + BufferedReader in = new BufferedReader(new InputStreamReader( + connection.getInputStream())); + for(String line; (line = in.readLine()) != null; out.println(line)); + in.close(); + + log.debug("log.debug2"); + log.info("log.info2"); + log.error("log.error2"); + assertTrue(Level.ERROR.equals(log.getEffectiveLevel())); + + //command line + String[] args = {"-setlevel", "localhost:"+port, logName,""+Level.DEBUG}; + LogLevel.main(args); + log.debug("log.debug3"); + log.info("log.info3"); + log.error("log.error3"); + assertTrue(Level.DEBUG.equals(log.getEffectiveLevel())); + } + else { + out.println(testlog.getClass() + " not tested."); + } + } +} diff --git a/src/test/org/apache/hadoop/metrics/TestMetricsServlet.java b/src/test/org/apache/hadoop/metrics/TestMetricsServlet.java new file mode 100644 index 00000000000..8d5cfc9a553 --- /dev/null +++ b/src/test/org/apache/hadoop/metrics/TestMetricsServlet.java @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.metrics; + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.hadoop.metrics.MetricsServlet.TagsMetricsPair; +import org.apache.hadoop.metrics.spi.NoEmitMetricsContext; +import org.apache.hadoop.metrics.spi.OutputRecord; +import org.mortbay.util.ajax.JSON; + +public class TestMetricsServlet extends TestCase { + MetricsContext nc1; + MetricsContext nc2; + // List containing nc1 and nc2. + List contexts; + OutputRecord outputRecord; + + /** + * Initializes, for testing, two NoEmitMetricsContext's, and adds one value + * to the first of them. + */ + public void setUp() throws IOException { + nc1 = new NoEmitMetricsContext(); + nc1.init("test1", ContextFactory.getFactory()); + nc2 = new NoEmitMetricsContext(); + nc2.init("test2", ContextFactory.getFactory()); + contexts = new ArrayList(); + contexts.add(nc1); + contexts.add(nc2); + + MetricsRecord r = nc1.createRecord("testRecord"); + + r.setTag("testTag1", "testTagValue1"); + r.setTag("testTag2", "testTagValue2"); + r.setMetric("testMetric1", 1); + r.setMetric("testMetric2", 33); + r.update(); + + Map> m = nc1.getAllRecords(); + assertEquals(1, m.size()); + assertEquals(1, m.values().size()); + Collection outputRecords = m.values().iterator().next(); + assertEquals(1, outputRecords.size()); + outputRecord = outputRecords.iterator().next(); + } + + + + public void testTagsMetricsPair() throws IOException { + TagsMetricsPair pair = new TagsMetricsPair(outputRecord.getTagsCopy(), + outputRecord.getMetricsCopy()); + String s = JSON.toString(pair); + assertEquals( + "[{\"testTag1\":\"testTagValue1\",\"testTag2\":\"testTagValue2\"},"+ + "{\"testMetric1\":1,\"testMetric2\":33}]", s); + } + + public void testGetMap() throws IOException { + MetricsServlet servlet = new MetricsServlet(); + Map>> m = servlet.makeMap(contexts); + assertEquals("Map missing contexts", 2, m.size()); + assertTrue(m.containsKey("test1")); + + Map> m2 = m.get("test1"); + + assertEquals("Missing records", 1, m2.size()); + assertTrue(m2.containsKey("testRecord")); + assertEquals("Wrong number of tags-values pairs.", 1, m2.get("testRecord").size()); + } + + public void testPrintMap() throws IOException { + StringWriter sw = new StringWriter(); + PrintWriter out = new PrintWriter(sw); + MetricsServlet servlet = new MetricsServlet(); + servlet.printMap(out, servlet.makeMap(contexts)); + + String EXPECTED = "" + + "test1\n" + + " testRecord\n" + + " {testTag1=testTagValue1,testTag2=testTagValue2}:\n" + + " testMetric1=1\n" + + " testMetric2=33\n" + + "test2\n"; + assertEquals(EXPECTED, sw.toString()); + } +} diff --git a/src/test/org/apache/hadoop/metrics/spi/TestOutputRecord.java b/src/test/org/apache/hadoop/metrics/spi/TestOutputRecord.java new file mode 100644 index 00000000000..02e94a9f1b0 --- /dev/null +++ b/src/test/org/apache/hadoop/metrics/spi/TestOutputRecord.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.metrics.spi; + +import org.apache.hadoop.metrics.spi.AbstractMetricsContext.MetricMap; +import org.apache.hadoop.metrics.spi.AbstractMetricsContext.TagMap; + +import junit.framework.TestCase; + +public class TestOutputRecord extends TestCase { + public void testCopy() { + TagMap tags = new TagMap(); + tags.put("tagkey", "tagval"); + MetricMap metrics = new MetricMap(); + metrics.put("metrickey", 123.4); + OutputRecord r = new OutputRecord(tags, metrics); + + assertEquals(tags, r.getTagsCopy()); + assertNotSame(tags, r.getTagsCopy()); + assertEquals(metrics, r.getMetricsCopy()); + assertNotSame(metrics, r.getMetricsCopy()); + } +} diff --git a/src/test/org/apache/hadoop/net/StaticMapping.java b/src/test/org/apache/hadoop/net/StaticMapping.java new file mode 100644 index 00000000000..c3923ed9510 --- /dev/null +++ b/src/test/org/apache/hadoop/net/StaticMapping.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.net; + +import java.util.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; + +/** + * Implements the {@link DNSToSwitchMapping} via static mappings. Used + * in testcases that simulate racks. 
+ * + */ +public class StaticMapping extends Configured implements DNSToSwitchMapping { + public void setconf(Configuration conf) { + String[] mappings = conf.getStrings("hadoop.configured.node.mapping"); + if (mappings != null) { + for (int i = 0; i < mappings.length; i++) { + String str = mappings[i]; + String host = str.substring(0, str.indexOf('=')); + String rack = str.substring(str.indexOf('=') + 1); + addNodeToRack(host, rack); + } + } + } + /* Only one instance per JVM */ + private static Map nameToRackMap = new HashMap(); + + static synchronized public void addNodeToRack(String name, String rackId) { + nameToRackMap.put(name, rackId); + } + public List resolve(List names) { + List m = new ArrayList(); + synchronized (nameToRackMap) { + for (String name : names) { + String rackId; + if ((rackId = nameToRackMap.get(name)) != null) { + m.add(rackId); + } else { + m.add(NetworkTopology.DEFAULT_RACK); + } + } + return m; + } + } +} diff --git a/src/test/org/apache/hadoop/net/TestDNS.java b/src/test/org/apache/hadoop/net/TestDNS.java new file mode 100644 index 00000000000..5825ecf8c63 --- /dev/null +++ b/src/test/org/apache/hadoop/net/TestDNS.java @@ -0,0 +1,150 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.net; + +import junit.framework.TestCase; + +import java.net.UnknownHostException; +import java.net.InetAddress; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import javax.naming.NameNotFoundException; + +/** + * + */ +public class TestDNS extends TestCase { + + private static final Log LOG = LogFactory.getLog(TestDNS.class); + private static final String DEFAULT = "default"; + + /** + * Constructs a test case with the given name. 
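StaticMapping above reads its host-to-rack table from the hadoop.configured.node.mapping key in host=rack form. A small usage sketch follows; the host and rack names are invented and this is illustration only, not part of the patch.

package org.apache.hadoop.net;

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;

public class StaticMappingSketch {
  public static void main(String[] args) {
    // Comma-separated host=rack entries, the format setconf() parses.
    Configuration conf = new Configuration();
    conf.set("hadoop.configured.node.mapping",
        "host1.example.com=/rack1,host2.example.com=/rack2");

    StaticMapping mapping = new StaticMapping();
    mapping.setconf(conf);  // note the lower-case method name in this test helper

    List<String> names = Arrays.asList("host1.example.com", "unknown.example.com");
    // Unknown hosts fall back to NetworkTopology.DEFAULT_RACK.
    System.out.println(mapping.resolve(names));
  }
}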
+ * + * @param name test name + */ + public TestDNS(String name) { + super(name); + } + + /** + * Test that asking for the default hostname works + * @throws Exception if hostname lookups fail */ + public void testGetLocalHost() throws Exception { + String hostname = DNS.getDefaultHost(DEFAULT); + assertNotNull(hostname); + } + + /** + * Test that repeated calls to getting the local host are fairly fast, and + * hence that caching is being used + * @throws Exception if hostname lookups fail + */ + public void testGetLocalHostIsFast() throws Exception { + String hostname = DNS.getDefaultHost(DEFAULT); + assertNotNull(hostname); + long t1 = System.currentTimeMillis(); + String hostname2 = DNS.getDefaultHost(DEFAULT); + long t2 = System.currentTimeMillis(); + String hostname3 = DNS.getDefaultHost(DEFAULT); + long t3 = System.currentTimeMillis(); + assertEquals(hostname3, hostname2); + assertEquals(hostname2, hostname); + long interval2 = t3 - t2; + assertTrue( + "It is taking to long to determine the local host -caching is not working", + interval2 < 20000); + } + + /** + * Test that our local IP address is not null + * @throws Exception if something went wrong + */ + public void testLocalHostHasAnAddress() throws Exception { + assertNotNull(getLocalIPAddr()); + } + + private InetAddress getLocalIPAddr() throws UnknownHostException { + String hostname = DNS.getDefaultHost(DEFAULT); + InetAddress localhost = InetAddress.getByName(hostname); + return localhost; + } + + /** + * Test that passing a null pointer is as the interface + * fails with a NullPointerException + * @throws Exception if something went wrong + */ + public void testNullInterface() throws Exception { + try { + String host = DNS.getDefaultHost(null); + fail("Expected a NullPointerException, got " + host); + } catch (NullPointerException expected) { + //this is expected + } + } + + /** + * Get the IP addresses of an unknown interface, expect to get something + * back + * @throws Exception if something went wrong + */ + public void testIPsOfUnknownInterface() throws Exception { + String[] ips = DNS.getIPs("name-of-an-unknown-interface"); + assertNotNull(ips); + assertTrue(ips.length > 0); + } + + /** + * TestCase: get our local address and reverse look it up + * @throws Exception if that fails + */ + public void testRDNS() throws Exception { + InetAddress localhost = getLocalIPAddr(); + try { + String s = DNS.reverseDns(localhost, null); + LOG.info("Local revers DNS hostname is " + s); + } catch (NameNotFoundException e) { + if (!localhost.isLinkLocalAddress() || localhost.isLoopbackAddress()) { + //these addresses probably won't work with rDNS anyway, unless someone + //has unusual entries in their DNS server mapping 1.0.0.127 to localhost + LOG.info("Reverse DNS failing as due to incomplete networking", e); + LOG.info("Address is " + localhost + + " Loopback=" + localhost.isLoopbackAddress() + + " Linklocal=" + localhost.isLinkLocalAddress()); + } + + } + } + + /** + * Test that the name "localhost" resolves to something. 
+ * + * If this fails, your machine's network is in a mess, go edit /etc/hosts + * @throws Exception for any problems + */ + public void testLocalhostResolves() throws Exception { + InetAddress localhost = InetAddress.getByName("localhost"); + assertNotNull("localhost is null", localhost); + LOG.info("Localhost IPAddr is " + localhost.toString()); + } +} diff --git a/src/test/org/apache/hadoop/net/TestScriptBasedMapping.java b/src/test/org/apache/hadoop/net/TestScriptBasedMapping.java new file mode 100644 index 00000000000..144dbaa0e36 --- /dev/null +++ b/src/test/org/apache/hadoop/net/TestScriptBasedMapping.java @@ -0,0 +1,46 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.net; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; + +import junit.framework.TestCase; + +public class TestScriptBasedMapping extends TestCase { + + public void testNoArgsMeansNoResult() { + ScriptBasedMapping mapping = new ScriptBasedMapping(); + + Configuration conf = new Configuration(); + conf.setInt(ScriptBasedMapping.SCRIPT_ARG_COUNT_KEY, + ScriptBasedMapping.MIN_ALLOWABLE_ARGS - 1); + conf.set(ScriptBasedMapping.SCRIPT_FILENAME_KEY, "any-filename"); + + mapping.setConf(conf); + + List names = new ArrayList(); + names.add("some.machine.name"); + names.add("other.machine.name"); + + List result = mapping.resolve(names); + assertNull(result); + } +} diff --git a/src/test/org/apache/hadoop/net/TestSocketIOWithTimeout.java b/src/test/org/apache/hadoop/net/TestSocketIOWithTimeout.java new file mode 100644 index 00000000000..53f320917c5 --- /dev/null +++ b/src/test/org/apache/hadoop/net/TestSocketIOWithTimeout.java @@ -0,0 +1,155 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
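TestScriptBasedMapping above only checks the degenerate case where the configured argument count falls below the minimum, so resolve() returns null. For context, a sketch of the two keys a deployment would normally set; the script path is hypothetical and the comments describe assumed semantics, not behavior verified by this patch.

package org.apache.hadoop.net;

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;

public class ScriptBasedMappingSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Path to a topology script that maps host names/IPs to rack paths (assumed to exist).
    conf.set(ScriptBasedMapping.SCRIPT_FILENAME_KEY, "/etc/hadoop/topology.sh");
    // Assumed meaning: maximum number of host arguments handed to the script per invocation.
    conf.setInt(ScriptBasedMapping.SCRIPT_ARG_COUNT_KEY, 100);

    ScriptBasedMapping mapping = new ScriptBasedMapping();
    mapping.setConf(conf);
    List<String> racks = mapping.resolve(Arrays.asList("host1.example.com"));
    System.out.println(racks);  // rack paths produced by the script, if it is present
  }
}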
+ */ +package org.apache.hadoop.net; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.SocketTimeoutException; +import java.nio.channels.Pipe; +import java.util.Arrays; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import junit.framework.TestCase; + +/** + * This tests timout out from SocketInputStream and + * SocketOutputStream using pipes. + * + * Normal read and write using these streams are tested by pretty much + * every DFS unit test. + */ +public class TestSocketIOWithTimeout extends TestCase { + + static Log LOG = LogFactory.getLog(TestSocketIOWithTimeout.class); + + private static int TIMEOUT = 1*1000; + private static String TEST_STRING = "1234567890"; + + private void doIO(InputStream in, OutputStream out) throws IOException { + /* Keep on writing or reading until we get SocketTimeoutException. + * It expects this exception to occur within 100 millis of TIMEOUT. + */ + byte buf[] = new byte[4192]; + + while (true) { + long start = System.currentTimeMillis(); + try { + if (in != null) { + in.read(buf); + } else { + out.write(buf); + } + } catch (SocketTimeoutException e) { + long diff = System.currentTimeMillis() - start; + LOG.info("Got SocketTimeoutException as expected after " + + diff + " millis : " + e.getMessage()); + assertTrue(Math.abs(TIMEOUT - diff) <= 200); + break; + } + } + } + + /** + * Just reads one byte from the input stream. + */ + static class ReadRunnable implements Runnable { + private InputStream in; + + public ReadRunnable(InputStream in) { + this.in = in; + } + public void run() { + try { + in.read(); + } catch (IOException e) { + LOG.info("Got expection while reading as expected : " + + e.getMessage()); + return; + } + assertTrue(false); + } + } + + public void testSocketIOWithTimeout() throws IOException { + + // first open pipe: + Pipe pipe = Pipe.open(); + Pipe.SourceChannel source = pipe.source(); + Pipe.SinkChannel sink = pipe.sink(); + + try { + InputStream in = new SocketInputStream(source, TIMEOUT); + OutputStream out = new SocketOutputStream(sink, TIMEOUT); + + byte[] writeBytes = TEST_STRING.getBytes(); + byte[] readBytes = new byte[writeBytes.length]; + + out.write(writeBytes); + doIO(null, out); + + in.read(readBytes); + assertTrue(Arrays.equals(writeBytes, readBytes)); + doIO(in, null); + + /* + * Verify that it handles interrupted threads properly. + * Use a large timeout and expect the thread to return quickly. + */ + in = new SocketInputStream(source, 0); + Thread thread = new Thread(new ReadRunnable(in)); + thread.start(); + + try { + Thread.sleep(1000); + } catch (InterruptedException ignored) {} + + thread.interrupt(); + + try { + thread.join(); + } catch (InterruptedException e) { + throw new IOException("Unexpected InterruptedException : " + e); + } + + //make sure the channels are still open + assertTrue(source.isOpen()); + assertTrue(sink.isOpen()); + + out.close(); + assertFalse(sink.isOpen()); + + // close sink and expect -1 from source.read() + assertEquals(-1, in.read()); + + // make sure close() closes the underlying channel. 
+ in.close(); + assertFalse(source.isOpen()); + + } finally { + if (source != null) { + source.close(); + } + if (sink != null) { + sink.close(); + } + } + } +} diff --git a/src/test/org/apache/hadoop/record/FromCpp.java b/src/test/org/apache/hadoop/record/FromCpp.java new file mode 100644 index 00000000000..2cd2271f43b --- /dev/null +++ b/src/test/org/apache/hadoop/record/FromCpp.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.record; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.TreeMap; +import junit.framework.*; + +/** + */ +public class FromCpp extends TestCase { + + public FromCpp(String testName) { + super(testName); + } + + protected void setUp() throws Exception { + } + + protected void tearDown() throws Exception { + } + + public void testBinary() { + File tmpfile; + try { + tmpfile = new File("/temp/hadooptmp.dat"); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + FileInputStream istream = new FileInputStream(tmpfile); + BinaryRecordInput in = new BinaryRecordInput(istream); + RecRecord1 r2 = new RecRecord1(); + r2.deserialize(in, ""); + istream.close(); + assertTrue(r1.equals(r2)); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public void testCsv() { + File tmpfile; + try { + tmpfile = new File("/temp/hadooptmp.txt"); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + FileInputStream istream = new FileInputStream(tmpfile); + CsvRecordInput in = new CsvRecordInput(istream); + RecRecord1 r2 = new RecRecord1(); + r2.deserialize(in, ""); + istream.close(); + assertTrue(r1.equals(r2)); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public void testXml() { + File tmpfile; + try { + tmpfile = new File("/temp/hadooptmp.xml"); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + FileInputStream istream = new 
FileInputStream(tmpfile); + XmlRecordInput in = new XmlRecordInput(istream); + RecRecord1 r2 = new RecRecord1(); + r2.deserialize(in, ""); + istream.close(); + assertTrue(r1.equals(r2)); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + +} diff --git a/src/test/org/apache/hadoop/record/RecordBench.java b/src/test/org/apache/hadoop/record/RecordBench.java new file mode 100644 index 00000000000..1cba75ed804 --- /dev/null +++ b/src/test/org/apache/hadoop/record/RecordBench.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.record; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.lang.reflect.Array; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.Random; + +/** + * Benchmark for various types of serializations + */ +public class RecordBench { + + private static class Times { + long init; + long serialize; + long deserialize; + long write; + long readFields; + }; + + private static final long SEED = 0xDEADBEEFL; + private static final Random rand = new Random(); + + /** Do not allow to create a new instance of RecordBench */ + private RecordBench() {} + + private static void initBuffers(Record[] buffers) { + final int BUFLEN = 32; + for (int idx = 0; idx < buffers.length; idx++) { + buffers[idx] = new RecBuffer(); + int buflen = rand.nextInt(BUFLEN); + byte[] bytes = new byte[buflen]; + rand.nextBytes(bytes); + ((RecBuffer)buffers[idx]).setData(new Buffer(bytes)); + } + } + + private static void initStrings(Record[] strings) { + final int STRLEN = 32; + for (int idx = 0; idx < strings.length; idx++) { + strings[idx] = new RecString(); + int strlen = rand.nextInt(STRLEN); + StringBuilder sb = new StringBuilder(strlen); + for (int ich = 0; ich < strlen; ich++) { + int cpt = 0; + while (true) { + cpt = rand.nextInt(0x10FFFF+1); + if (Utils.isValidCodePoint(cpt)) { + break; + } + } + sb.appendCodePoint(cpt); + } + ((RecString)strings[idx]).setData(sb.toString()); + } + } + + private static void initInts(Record[] ints) { + for (int idx = 0; idx < ints.length; idx++) { + ints[idx] = new RecInt(); + ((RecInt)ints[idx]).setData(rand.nextInt()); + } + } + + private static Record[] makeArray(String type, int numRecords, Times times) { + Method init = null; + try { + init = RecordBench.class.getDeclaredMethod("init"+ + toCamelCase(type) + "s", + new Class[] {Record[].class}); + } catch (NoSuchMethodException ex) { + throw new RuntimeException(ex); + } + + Record[] records = new Record[numRecords]; + times.init = System.nanoTime(); + try { + init.invoke(null, new Object[]{records}); + } catch 
(Exception ex) { + throw new RuntimeException(ex); + } + times.init = System.nanoTime() - times.init; + return records; + } + + private static void runBinaryBench(String type, int numRecords, Times times) + throws IOException { + Record[] records = makeArray(type, numRecords, times); + ByteArrayOutputStream bout = new ByteArrayOutputStream(); + BinaryRecordOutput rout = new BinaryRecordOutput(bout); + DataOutputStream dout = new DataOutputStream(bout); + + for(int idx = 0; idx < numRecords; idx++) { + records[idx].serialize(rout); + } + bout.reset(); + + times.serialize = System.nanoTime(); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].serialize(rout); + } + times.serialize = System.nanoTime() - times.serialize; + + byte[] serialized = bout.toByteArray(); + ByteArrayInputStream bin = new ByteArrayInputStream(serialized); + BinaryRecordInput rin = new BinaryRecordInput(bin); + + times.deserialize = System.nanoTime(); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].deserialize(rin); + } + times.deserialize = System.nanoTime() - times.deserialize; + + bout.reset(); + + times.write = System.nanoTime(); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].write(dout); + } + times.write = System.nanoTime() - times.write; + + bin.reset(); + DataInputStream din = new DataInputStream(bin); + + times.readFields = System.nanoTime(); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].readFields(din); + } + times.readFields = System.nanoTime() - times.readFields; + } + + private static void runCsvBench(String type, int numRecords, Times times) + throws IOException { + Record[] records = makeArray(type, numRecords, times); + ByteArrayOutputStream bout = new ByteArrayOutputStream(); + CsvRecordOutput rout = new CsvRecordOutput(bout); + + for(int idx = 0; idx < numRecords; idx++) { + records[idx].serialize(rout); + } + bout.reset(); + + times.serialize = System.nanoTime(); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].serialize(rout); + } + times.serialize = System.nanoTime() - times.serialize; + + byte[] serialized = bout.toByteArray(); + ByteArrayInputStream bin = new ByteArrayInputStream(serialized); + CsvRecordInput rin = new CsvRecordInput(bin); + + times.deserialize = System.nanoTime(); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].deserialize(rin); + } + times.deserialize = System.nanoTime() - times.deserialize; + } + + private static void runXmlBench(String type, int numRecords, Times times) + throws IOException { + Record[] records = makeArray(type, numRecords, times); + ByteArrayOutputStream bout = new ByteArrayOutputStream(); + XmlRecordOutput rout = new XmlRecordOutput(bout); + + for(int idx = 0; idx < numRecords; idx++) { + records[idx].serialize(rout); + } + bout.reset(); + + bout.write("\n".getBytes()); + + times.serialize = System.nanoTime(); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].serialize(rout); + } + times.serialize = System.nanoTime() - times.serialize; + + bout.write("\n".getBytes()); + + byte[] serialized = bout.toByteArray(); + ByteArrayInputStream bin = new ByteArrayInputStream(serialized); + + times.deserialize = System.nanoTime(); + XmlRecordInput rin = new XmlRecordInput(bin); + for(int idx = 0; idx < numRecords; idx++) { + records[idx].deserialize(rin); + } + times.deserialize = System.nanoTime() - times.deserialize; + } + + private static void printTimes(String type, + String format, + int numRecords, + Times times) { + System.out.println("Type: " + type + " Format: " + format + + 
" #Records: "+numRecords); + if (times.init != 0) { + System.out.println("Initialization Time (Per record) : "+ + times.init/numRecords + " Nanoseconds"); + } + + if (times.serialize != 0) { + System.out.println("Serialization Time (Per Record) : "+ + times.serialize/numRecords + " Nanoseconds"); + } + + if (times.deserialize != 0) { + System.out.println("Deserialization Time (Per Record) : "+ + times.deserialize/numRecords + " Nanoseconds"); + } + + if (times.write != 0) { + System.out.println("Write Time (Per Record) : "+ + times.write/numRecords + " Nanoseconds"); + } + + if (times.readFields != 0) { + System.out.println("ReadFields Time (Per Record) : "+ + times.readFields/numRecords + " Nanoseconds"); + } + + System.out.println(); + } + + private static String toCamelCase(String inp) { + char firstChar = inp.charAt(0); + if (Character.isLowerCase(firstChar)) { + return ""+Character.toUpperCase(firstChar) + inp.substring(1); + } + return inp; + } + + private static void exitOnError() { + String usage = "RecordBench {buffer|string|int}"+ + " {binary|csv|xml} "; + System.out.println(usage); + System.exit(1); + } + + /** + * @param args the command line arguments + */ + public static void main(String[] args) throws IOException { + String version = "RecordBench v0.1"; + System.out.println(version+"\n"); + + if (args.length != 3) { + exitOnError(); + } + + String typeName = args[0]; + String format = args[1]; + int numRecords = Integer.decode(args[2]).intValue(); + + Method bench = null; + try { + bench = RecordBench.class.getDeclaredMethod("run"+ + toCamelCase(format) + "Bench", + new Class[] {String.class, Integer.TYPE, Times.class}); + } catch (NoSuchMethodException ex) { + ex.printStackTrace(); + exitOnError(); + } + + if (numRecords < 0) { + exitOnError(); + } + + // dry run + rand.setSeed(SEED); + Times times = new Times(); + try { + bench.invoke(null, new Object[] {typeName, numRecords, times}); + } catch (Exception ex) { + ex.printStackTrace(); + System.exit(1); + } + + // timed run + rand.setSeed(SEED); + try { + bench.invoke(null, new Object[] {typeName, numRecords, times}); + } catch (Exception ex) { + ex.printStackTrace(); + System.exit(1); + } + printTimes(typeName, format, numRecords, times); + } +} diff --git a/src/test/org/apache/hadoop/record/TestBuffer.java b/src/test/org/apache/hadoop/record/TestBuffer.java new file mode 100644 index 00000000000..3012fa6ff46 --- /dev/null +++ b/src/test/org/apache/hadoop/record/TestBuffer.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.record; + +import junit.framework.*; + +/** + * A Unit test for Record I/O Buffer class + */ +public class TestBuffer extends TestCase { + + public TestBuffer(String testName) { + super(testName); + } + + /** + * Test of set method, of class org.apache.hadoop.record.Buffer. + */ + public void testSet() { + final byte[] bytes = new byte[10]; + final Buffer instance = new Buffer(); + + instance.set(bytes); + + assertEquals("set failed", bytes, instance.get()); + } + + /** + * Test of copy method, of class org.apache.hadoop.record.Buffer. + */ + public void testCopy() { + final byte[] bytes = new byte[10]; + final int offset = 6; + final int length = 3; + for (int idx = 0; idx < 10; idx ++) { + bytes[idx] = (byte) idx; + } + final Buffer instance = new Buffer(); + + instance.copy(bytes, offset, length); + + assertEquals("copy failed", 3, instance.getCapacity()); + assertEquals("copy failed", 3, instance.get().length); + for (int idx = 0; idx < 3; idx++) { + assertEquals("Buffer content corrupted", idx+6, instance.get()[idx]); + } + } + + /** + * Test of getCount method, of class org.apache.hadoop.record.Buffer. + */ + public void testGetCount() { + final Buffer instance = new Buffer(); + + final int expResult = 0; + final int result = instance.getCount(); + assertEquals("getSize failed", expResult, result); + } + + /** + * Test of getCapacity method, of class org.apache.hadoop.record.Buffer. + */ + public void testGetCapacity() { + final Buffer instance = new Buffer(); + + final int expResult = 0; + final int result = instance.getCapacity(); + assertEquals("getCapacity failed", expResult, result); + + instance.setCapacity(100); + assertEquals("setCapacity failed", 100, instance.getCapacity()); + } + + /** + * Test of truncate method, of class org.apache.hadoop.record.Buffer. + */ + public void testTruncate() { + final Buffer instance = new Buffer(); + instance.setCapacity(100); + assertEquals("setCapacity failed", 100, instance.getCapacity()); + + instance.truncate(); + assertEquals("truncate failed", 0, instance.getCapacity()); + } + + /** + * Test of append method, of class org.apache.hadoop.record.Buffer. + */ + public void testAppend() { + final byte[] bytes = new byte[100]; + final int offset = 0; + final int length = 100; + for (int idx = 0; idx < 100; idx++) { + bytes[idx] = (byte) (100-idx); + } + + final Buffer instance = new Buffer(); + + instance.append(bytes, offset, length); + + assertEquals("Buffer size mismatch", 100, instance.getCount()); + + for (int idx = 0; idx < 100; idx++) { + assertEquals("Buffer contents corrupted", 100-idx, instance.get()[idx]); + } + + } +} diff --git a/src/test/org/apache/hadoop/record/TestRecordIO.java b/src/test/org/apache/hadoop/record/TestRecordIO.java new file mode 100644 index 00000000000..163ec1b00b2 --- /dev/null +++ b/src/test/org/apache/hadoop/record/TestRecordIO.java @@ -0,0 +1,199 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.record; + +import java.io.IOException; +import junit.framework.*; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.util.ArrayList; +import java.util.TreeMap; + +/** + */ +public class TestRecordIO extends TestCase { + + public TestRecordIO(String testName) { + super(testName); + } + + protected void setUp() throws Exception { + } + + protected void tearDown() throws Exception { + } + + public void testBinary() { + File tmpfile; + try { + tmpfile = File.createTempFile("hadooprec", ".dat"); + FileOutputStream ostream = new FileOutputStream(tmpfile); + BinaryRecordOutput out = new BinaryRecordOutput(ostream); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(-4567); + r1.setLongVal(-2367L); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + RecRecord0 r0 = new RecRecord0(); + r0.setStringVal("other random text"); + r1.setRecordVal(r0); + r1.serialize(out, ""); + ostream.close(); + FileInputStream istream = new FileInputStream(tmpfile); + BinaryRecordInput in = new BinaryRecordInput(istream); + RecRecord1 r2 = new RecRecord1(); + r2.deserialize(in, ""); + istream.close(); + tmpfile.delete(); + assertTrue("Serialized and deserialized records do not match.", r1.equals(r2)); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public void testCsv() { + File tmpfile; + try { + tmpfile = File.createTempFile("hadooprec", ".txt"); + FileOutputStream ostream = new FileOutputStream(tmpfile); + CsvRecordOutput out = new CsvRecordOutput(ostream); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + RecRecord0 r0 = new RecRecord0(); + r0.setStringVal("other random text"); + r1.setRecordVal(r0); + r1.serialize(out, ""); + ostream.close(); + FileInputStream istream = new FileInputStream(tmpfile); + CsvRecordInput in = new CsvRecordInput(istream); + RecRecord1 r2 = new RecRecord1(); + r2.deserialize(in, ""); + istream.close(); + tmpfile.delete(); + assertTrue("Serialized and deserialized records do not match.", r1.equals(r2)); + + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public void testToString() { + try { + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("random text"); + byte[] barr = new byte[256]; + for (int idx = 0; idx < 256; idx++) { + barr[idx] = (byte) idx; + } + r1.setBufferVal(new Buffer(barr)); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + RecRecord0 r0 = new RecRecord0(); + r0.setStringVal("other random text"); + 
r1.setRecordVal(r0); + System.err.println("Illustrating toString bug"+r1.toString()); + System.err.println("Illustrating toString bug"+r1.toString()); + } catch (Throwable ex) { + assertTrue("Record.toString cannot be invoked twice in succession."+ + "This bug has been fixed in the latest version.", false); + } + } + + public void testXml() { + File tmpfile; + try { + tmpfile = File.createTempFile("hadooprec", ".xml"); + FileOutputStream ostream = new FileOutputStream(tmpfile); + XmlRecordOutput out = new XmlRecordOutput(ostream); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("ran\002dom < %text<&more\uffff"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + RecRecord0 r0 = new RecRecord0(); + r0.setStringVal("other %rando\007m & >&more text"); + r1.setRecordVal(r0); + r1.serialize(out, ""); + ostream.close(); + FileInputStream istream = new FileInputStream(tmpfile); + XmlRecordInput in = new XmlRecordInput(istream); + RecRecord1 r2 = new RecRecord1(); + r2.deserialize(in, ""); + istream.close(); + tmpfile.delete(); + assertTrue("Serialized and deserialized records do not match.", r1.equals(r2)); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public void testCloneable() { + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(-4567); + r1.setLongVal(-2367L); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + RecRecord0 r0 = new RecRecord0(); + r0.setStringVal("other random text"); + r1.setRecordVal(r0); + try { + RecRecord1 r2 = (RecRecord1) r1.clone(); + assertTrue("Cloneable semantics violated. r1==r2", r1 != r2); + assertTrue("Cloneable semantics violated. r1.getClass() != r2.getClass()", + r1.getClass() == r2.getClass()); + assertTrue("Cloneable semantics violated. !r2.equals(r1)", r2.equals(r1)); + } catch (final CloneNotSupportedException ex) { + ex.printStackTrace(); + } + } +} diff --git a/src/test/org/apache/hadoop/record/TestRecordVersioning.java b/src/test/org/apache/hadoop/record/TestRecordVersioning.java new file mode 100644 index 00000000000..129ba2ced86 --- /dev/null +++ b/src/test/org/apache/hadoop/record/TestRecordVersioning.java @@ -0,0 +1,239 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
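The record I/O tests above always round-trip through temporary files. Purely for illustration (not part of the patch), the same generated RecRecord1/RecRecord0 types can be round-tripped in memory:

package org.apache.hadoop.record;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.TreeMap;

public class RecordRoundTripSketch {
  public static void main(String[] args) throws IOException {
    // Populate every field, as the tests do, so serialization never sees a null.
    RecRecord1 r1 = new RecRecord1();
    r1.setBoolVal(true);
    r1.setByteVal((byte) 0x66);
    r1.setFloatVal(3.145F);
    r1.setDoubleVal(1.5234);
    r1.setIntVal(4567);
    r1.setLongVal(0x5a5a5a5a5a5aL);
    r1.setStringVal("random text");
    r1.setBufferVal(new Buffer());
    r1.setVectorVal(new ArrayList());
    r1.setMapVal(new TreeMap());
    RecRecord0 r0 = new RecRecord0();
    r0.setStringVal("other random text");
    r1.setRecordVal(r0);

    // Serialize to a byte array instead of a temp file, then read it back.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    r1.serialize(new BinaryRecordOutput(bytes), "");
    RecRecord1 r2 = new RecRecord1();
    r2.deserialize(new BinaryRecordInput(new ByteArrayInputStream(bytes.toByteArray())), "");
    System.out.println(r1.equals(r2));  // expect true
  }
}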
+ */ + +package org.apache.hadoop.record; + +import java.io.IOException; +import junit.framework.*; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.util.ArrayList; +import java.util.TreeMap; +import org.apache.hadoop.record.meta.RecordTypeInfo; + +/** + */ +public class TestRecordVersioning extends TestCase { + + public TestRecordVersioning(String testName) { + super(testName); + } + + protected void setUp() throws Exception { + } + + protected void tearDown() throws Exception { + } + + /* + * basic versioning + * write out a record and its type info, read it back using its typeinfo + */ + public void testBasic() { + File tmpfile, tmpRTIfile; + try { + tmpfile = File.createTempFile("hadooprec", ".dat"); + tmpRTIfile = File.createTempFile("hadooprti", ".dat"); + FileOutputStream ostream = new FileOutputStream(tmpfile); + BinaryRecordOutput out = new BinaryRecordOutput(ostream); + FileOutputStream oRTIstream = new FileOutputStream(tmpRTIfile); + BinaryRecordOutput outRTI = new BinaryRecordOutput(oRTIstream); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(-4567); + r1.setLongVal(-2367L); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + RecRecord0 r0 = new RecRecord0(); + r0.setStringVal("other random text"); + r1.setRecordVal(r0); + r1.serialize(out, ""); + ostream.close(); + // write out the type info + RecRecord1.getTypeInfo().serialize(outRTI); + oRTIstream.close(); + + // read + FileInputStream istream = new FileInputStream(tmpfile); + BinaryRecordInput in = new BinaryRecordInput(istream); + FileInputStream iRTIstream = new FileInputStream(tmpRTIfile); + BinaryRecordInput inRTI = new BinaryRecordInput(iRTIstream); + RecordTypeInfo rti = new RecordTypeInfo(); + rti.deserialize(inRTI); + iRTIstream.close(); + RecRecord1.setTypeFilter(rti); + RecRecord1 r2 = new RecRecord1(); + r2.deserialize(in, ""); + istream.close(); + tmpfile.delete(); + tmpRTIfile.delete(); + assertTrue("Serialized and deserialized versioned records do not match.", r1.equals(r2)); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + /* + * versioning + * write out a record and its type info, read back a similar record using the written record's typeinfo + */ + public void testVersioning() { + File tmpfile, tmpRTIfile; + try { + tmpfile = File.createTempFile("hadooprec", ".dat"); + tmpRTIfile = File.createTempFile("hadooprti", ".dat"); + FileOutputStream ostream = new FileOutputStream(tmpfile); + BinaryRecordOutput out = new BinaryRecordOutput(ostream); + FileOutputStream oRTIstream = new FileOutputStream(tmpRTIfile); + BinaryRecordOutput outRTI = new BinaryRecordOutput(oRTIstream); + + // we create an array of records to write + ArrayList recsWrite = new ArrayList(); + int i, j, k, l; + for (i=0; i<5; i++) { + RecRecordOld s1Rec = new RecRecordOld(); + + s1Rec.setName("This is record s1: " + i); + + ArrayList iA = new ArrayList(); + for (j=0; j<3; j++) { + iA.add(new Long(i+j)); + } + s1Rec.setIvec(iA); + + ArrayList> ssVec = new ArrayList>(); + for (j=0; j<2; j++) { + ArrayList sVec = new ArrayList(); + for (k=0; k<3; k++) { + RecRecord0 sRec = new RecRecord0("This is record s: ("+j+": "+k+")"); + sVec.add(sRec); + } + ssVec.add(sVec); + } + s1Rec.setSvec(ssVec); + + s1Rec.setInner(new RecRecord0("This is record s: " + i)); + + ArrayList>> aaaVec = new 
ArrayList>>(); + for (l=0; l<2; l++) { + ArrayList> aaVec = new ArrayList>(); + for (j=0; j<2; j++) { + ArrayList aVec = new ArrayList(); + for (k=0; k<3; k++) { + aVec.add(new String("THis is a nested string: (" + l + ": " + j + ": " + k + ")")); + } + aaVec.add(aVec); + } + aaaVec.add(aaVec); + } + s1Rec.setStrvec(aaaVec); + + s1Rec.setI1(100+i); + + java.util.TreeMap map1 = new java.util.TreeMap(); + map1.put(new Byte("23"), "23"); + map1.put(new Byte("11"), "11"); + s1Rec.setMap1(map1); + + java.util.TreeMap m1 = new java.util.TreeMap(); + java.util.TreeMap m2 = new java.util.TreeMap(); + m1.put(new Integer(5), 5L); + m1.put(new Integer(10), 10L); + m2.put(new Integer(15), 15L); + m2.put(new Integer(20), 20L); + java.util.ArrayList> vm1 = new java.util.ArrayList>(); + vm1.add(m1); + vm1.add(m2); + s1Rec.setMvec1(vm1); + java.util.ArrayList> vm2 = new java.util.ArrayList>(); + vm2.add(m1); + s1Rec.setMvec2(vm2); + + // add to our list + recsWrite.add(s1Rec); + } + + // write out to file + for (RecRecordOld rec: recsWrite) { + rec.serialize(out); + } + ostream.close(); + // write out the type info + RecRecordOld.getTypeInfo().serialize(outRTI); + oRTIstream.close(); + + // read + FileInputStream istream = new FileInputStream(tmpfile); + BinaryRecordInput in = new BinaryRecordInput(istream); + FileInputStream iRTIstream = new FileInputStream(tmpRTIfile); + BinaryRecordInput inRTI = new BinaryRecordInput(iRTIstream); + RecordTypeInfo rti = new RecordTypeInfo(); + + // read type info + rti.deserialize(inRTI); + iRTIstream.close(); + RecRecordNew.setTypeFilter(rti); + + // read records + ArrayList recsRead = new ArrayList(); + for (i=0; i> ss2Vec = s2In.getStrvec().get(j); + ArrayList> ss1Vec = s1Out.getStrvec().get(j); + for (k=0; k s2Vec = ss2Vec.get(k); + ArrayList s1Vec = ss1Vec.get(k); + for (l=0; l()); + r1.setMapVal(new TreeMap()); + r1.serialize(out, ""); + ostream.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public void testCsv() { + File tmpfile; + try { + tmpfile = new File("/tmp/hadooptemp.txt"); + FileOutputStream ostream = new FileOutputStream(tmpfile); + CsvRecordOutput out = new CsvRecordOutput(ostream); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + r1.serialize(out, ""); + ostream.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + public void testXml() { + File tmpfile; + try { + tmpfile = new File("/tmp/hadooptemp.xml"); + FileOutputStream ostream = new FileOutputStream(tmpfile); + XmlRecordOutput out = new XmlRecordOutput(ostream); + RecRecord1 r1 = new RecRecord1(); + r1.setBoolVal(true); + r1.setByteVal((byte)0x66); + r1.setFloatVal(3.145F); + r1.setDoubleVal(1.5234); + r1.setIntVal(4567); + r1.setLongVal(0x5a5a5a5a5a5aL); + r1.setStringVal("random text"); + r1.setBufferVal(new Buffer()); + r1.setVectorVal(new ArrayList()); + r1.setMapVal(new TreeMap()); + r1.serialize(out, ""); + ostream.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } +} diff --git a/src/test/org/apache/hadoop/security/TestAccessControlList.java b/src/test/org/apache/hadoop/security/TestAccessControlList.java new file mode 100644 index 00000000000..57c5abf875a --- /dev/null +++ b/src/test/org/apache/hadoop/security/TestAccessControlList.java @@ -0,0 
+1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.security; + +import java.util.Iterator; +import java.util.Set; + +import org.apache.hadoop.security.SecurityUtil.AccessControlList; + +import junit.framework.TestCase; + +public class TestAccessControlList extends TestCase { + + public void testWildCardAccessControlList() throws Exception { + AccessControlList acl; + + acl = new AccessControlList("*"); + assertTrue(acl.allAllowed()); + + acl = new AccessControlList(" * "); + assertTrue(acl.allAllowed()); + + acl = new AccessControlList(" *"); + assertTrue(acl.allAllowed()); + + acl = new AccessControlList("* "); + assertTrue(acl.allAllowed()); + } + + public void testAccessControlList() throws Exception { + AccessControlList acl; + Set users; + Set groups; + + acl = new AccessControlList("drwho tardis"); + users = acl.getUsers(); + assertEquals(users.size(), 1); + assertEquals(users.iterator().next(), "drwho"); + groups = acl.getGroups(); + assertEquals(groups.size(), 1); + assertEquals(groups.iterator().next(), "tardis"); + + acl = new AccessControlList("drwho"); + users = acl.getUsers(); + assertEquals(users.size(), 1); + assertEquals(users.iterator().next(), "drwho"); + groups = acl.getGroups(); + assertEquals(groups.size(), 0); + + acl = new AccessControlList("drwho "); + users = acl.getUsers(); + assertEquals(users.size(), 1); + assertEquals(users.iterator().next(), "drwho"); + groups = acl.getGroups(); + assertEquals(groups.size(), 0); + + acl = new AccessControlList(" tardis"); + users = acl.getUsers(); + assertEquals(users.size(), 0); + groups = acl.getGroups(); + assertEquals(groups.size(), 1); + assertEquals(groups.iterator().next(), "tardis"); + + Iterator iter; + acl = new AccessControlList("drwho,joe tardis,users"); + users = acl.getUsers(); + assertEquals(users.size(), 2); + iter = users.iterator(); + assertEquals(iter.next(), "drwho"); + assertEquals(iter.next(), "joe"); + groups = acl.getGroups(); + assertEquals(groups.size(), 2); + iter = groups.iterator(); + assertEquals(iter.next(), "tardis"); + assertEquals(iter.next(), "users"); + + acl = new AccessControlList("drwho,joe tardis, users"); + users = acl.getUsers(); + assertEquals(users.size(), 2); + iter = users.iterator(); + assertEquals(iter.next(), "drwho"); + assertEquals(iter.next(), "joe"); + groups = acl.getGroups(); + assertEquals(groups.size(), 2); + iter = groups.iterator(); + assertEquals(iter.next(), "tardis"); + assertEquals(iter.next(), "users"); + } +} diff --git a/src/test/org/apache/hadoop/security/TestAccessToken.java b/src/test/org/apache/hadoop/security/TestAccessToken.java new file mode 100644 index 00000000000..cd3cc4c482a --- /dev/null +++ b/src/test/org/apache/hadoop/security/TestAccessToken.java @@ -0,0 +1,89 
@@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.security; + +import java.util.EnumSet; + +import org.apache.hadoop.io.TestWritable; + +import junit.framework.TestCase; + +/** Unit tests for access tokens */ +public class TestAccessToken extends TestCase { + long accessKeyUpdateInterval = 10 * 60 * 1000; // 10 mins + long accessTokenLifetime = 2 * 60 * 1000; // 2 mins + long blockID1 = 0L; + long blockID2 = 10L; + long blockID3 = -108L; + + /** test Writable */ + public void testWritable() throws Exception { + TestWritable.testWritable(ExportedAccessKeys.DUMMY_KEYS); + AccessTokenHandler handler = new AccessTokenHandler(true, + accessKeyUpdateInterval, accessTokenLifetime); + ExportedAccessKeys keys = handler.exportKeys(); + TestWritable.testWritable(keys); + TestWritable.testWritable(AccessToken.DUMMY_TOKEN); + AccessToken token = handler.generateToken(blockID3, EnumSet + .allOf(AccessTokenHandler.AccessMode.class)); + TestWritable.testWritable(token); + } + + private void tokenGenerationAndVerification(AccessTokenHandler master, + AccessTokenHandler slave) throws Exception { + // single-mode tokens + for (AccessTokenHandler.AccessMode mode : AccessTokenHandler.AccessMode + .values()) { + // generated by master + AccessToken token1 = master.generateToken(blockID1, EnumSet.of(mode)); + assertTrue(master.checkAccess(token1, null, blockID1, mode)); + assertTrue(slave.checkAccess(token1, null, blockID1, mode)); + // generated by slave + AccessToken token2 = slave.generateToken(blockID2, EnumSet.of(mode)); + assertTrue(master.checkAccess(token2, null, blockID2, mode)); + assertTrue(slave.checkAccess(token2, null, blockID2, mode)); + } + // multi-mode tokens + AccessToken mtoken = master.generateToken(blockID3, EnumSet + .allOf(AccessTokenHandler.AccessMode.class)); + for (AccessTokenHandler.AccessMode mode : AccessTokenHandler.AccessMode + .values()) { + assertTrue(master.checkAccess(mtoken, null, blockID3, mode)); + assertTrue(slave.checkAccess(mtoken, null, blockID3, mode)); + } + } + + /** test access key and token handling */ + public void testAccessTokenHandler() throws Exception { + AccessTokenHandler masterHandler = new AccessTokenHandler(true, + accessKeyUpdateInterval, accessTokenLifetime); + AccessTokenHandler slaveHandler = new AccessTokenHandler(false, + accessKeyUpdateInterval, accessTokenLifetime); + ExportedAccessKeys keys = masterHandler.exportKeys(); + slaveHandler.setKeys(keys); + tokenGenerationAndVerification(masterHandler, slaveHandler); + // key updating + masterHandler.updateKeys(); + tokenGenerationAndVerification(masterHandler, slaveHandler); + keys = masterHandler.exportKeys(); + slaveHandler.setKeys(keys); + tokenGenerationAndVerification(masterHandler, slaveHandler); + } + +} diff 
--git a/src/test/org/apache/hadoop/security/TestUnixUserGroupInformation.java b/src/test/org/apache/hadoop/security/TestUnixUserGroupInformation.java new file mode 100644 index 00000000000..51880c2d1f6 --- /dev/null +++ b/src/test/org/apache/hadoop/security/TestUnixUserGroupInformation.java @@ -0,0 +1,103 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.security; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.TestWritable; + +import junit.framework.TestCase; + +/** Unit tests for UnixUserGroupInformation */ +public class TestUnixUserGroupInformation extends TestCase { + final private static String USER_NAME = "user1"; + final private static String GROUP1_NAME = "group1"; + final private static String GROUP2_NAME = "group2"; + final private static String GROUP3_NAME = "group3"; + final private static String[] GROUP_NAMES = + new String[]{GROUP1_NAME, GROUP2_NAME, GROUP3_NAME}; + + /** Test login method */ + public void testLogin() throws Exception { + Configuration conf = new Configuration(); + + // loin from unix + String userName = UnixUserGroupInformation.getUnixUserName(); + UnixUserGroupInformation curUserGroupInfo = + UnixUserGroupInformation.login(conf); + assertEquals(curUserGroupInfo.getUserName(), userName); + assertTrue(curUserGroupInfo == UnixUserGroupInformation.login(conf)); + + // login from the configuration + UnixUserGroupInformation userGroupInfo = new UnixUserGroupInformation( + USER_NAME, GROUP_NAMES ); + UnixUserGroupInformation.saveToConf(conf, + UnixUserGroupInformation.UGI_PROPERTY_NAME, userGroupInfo); + curUserGroupInfo = UnixUserGroupInformation.login(conf); + assertEquals(curUserGroupInfo, userGroupInfo); + assertTrue(curUserGroupInfo == UnixUserGroupInformation.login(conf)); + } + + /** test constructor */ + public void testConstructor() throws Exception { + UnixUserGroupInformation uugi = + new UnixUserGroupInformation(USER_NAME, GROUP_NAMES); + assertEquals(uugi, new UnixUserGroupInformation( new String[]{ + USER_NAME, GROUP1_NAME, GROUP2_NAME, GROUP3_NAME} )); + // failure test + testConstructorFailures(null, GROUP_NAMES); + testConstructorFailures("", GROUP_NAMES); + testConstructorFailures(USER_NAME, null); + testConstructorFailures(USER_NAME, new String[0]); + testConstructorFailures(USER_NAME, new String[]{null}); + testConstructorFailures(USER_NAME, new String[]{""}); + testConstructorFailures(USER_NAME, new String[]{GROUP1_NAME, null}); + testConstructorFailures(USER_NAME, + new String[]{GROUP1_NAME, null, GROUP2_NAME}); + } + + private void testConstructorFailures(String userName, String[] groupNames) { + boolean gotException = false; + try { + new UnixUserGroupInformation(userName, groupNames); + } catch (Exception e) { + gotException = 
true; + } + assertTrue(gotException); + } + + public void testEquals() throws Exception { + UnixUserGroupInformation uugi = + new UnixUserGroupInformation(USER_NAME, GROUP_NAMES); + + assertEquals(uugi, uugi); + assertEquals(uugi, new UnixUserGroupInformation(USER_NAME, GROUP_NAMES)); + assertEquals(uugi, new UnixUserGroupInformation(USER_NAME, + new String[]{GROUP1_NAME, GROUP3_NAME, GROUP2_NAME})); + assertFalse(uugi.equals(new UnixUserGroupInformation())); + assertFalse(uugi.equals(new UnixUserGroupInformation(USER_NAME, + new String[]{GROUP2_NAME, GROUP3_NAME, GROUP1_NAME}))); + } + + /** test Writable */ + public void testWritable() throws Exception { + UnixUserGroupInformation ugi = new UnixUserGroupInformation( + USER_NAME, GROUP_NAMES); + TestWritable.testWritable(ugi, new Configuration()); + } +} diff --git a/src/test/org/apache/hadoop/security/authorize/TestConfiguredPolicy.java b/src/test/org/apache/hadoop/security/authorize/TestConfiguredPolicy.java new file mode 100644 index 00000000000..203946cabd8 --- /dev/null +++ b/src/test/org/apache/hadoop/security/authorize/TestConfiguredPolicy.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.security.authorize; + +import java.security.Permission; + +import javax.security.auth.Subject; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.security.SecurityUtil; +import org.apache.hadoop.security.UnixUserGroupInformation; +import org.apache.hadoop.security.SecurityUtil.AccessControlList; + +import junit.framework.TestCase; + +public class TestConfiguredPolicy extends TestCase { + private static final String USER1 = "drwho"; + private static final String USER2 = "joe"; + private static final String[] GROUPS1 = new String[]{"tardis"}; + private static final String[] GROUPS2 = new String[]{"users"}; + + private static final String KEY_1 = "test.policy.1"; + private static final String KEY_2 = "test.policy.2"; + + public static class Protocol1 { + int i; + } + public static class Protocol2 { + int j; + } + + private static class TestPolicyProvider extends PolicyProvider { + @Override + public Service[] getServices() { + return new Service[] { + new Service(KEY_1, Protocol1.class), + new Service(KEY_2, Protocol2.class), + }; + } + } + + public void testConfiguredPolicy() throws Exception { + Configuration conf = new Configuration(); + conf.set(KEY_1, AccessControlList.WILDCARD_ACL_VALUE); + conf.set(KEY_2, USER1 + " " + GROUPS1[0]); + + ConfiguredPolicy policy = new ConfiguredPolicy(conf, new TestPolicyProvider()); + SecurityUtil.setPolicy(policy); + + Subject user1 = + SecurityUtil.getSubject(new UnixUserGroupInformation(USER1, GROUPS1)); + + // Should succeed + ServiceAuthorizationManager.authorize(user1, Protocol1.class); + + // Should fail + Subject user2 = + SecurityUtil.getSubject(new UnixUserGroupInformation(USER2, GROUPS2)); + boolean failed = false; + try { + ServiceAuthorizationManager.authorize(user2, Protocol2.class); + } catch (AuthorizationException ae) { + failed = true; + } + assertTrue(failed); + } +} diff --git a/src/test/org/apache/hadoop/test/CoreTestDriver.java b/src/test/org/apache/hadoop/test/CoreTestDriver.java new file mode 100644 index 00000000000..06590c9cdf8 --- /dev/null +++ b/src/test/org/apache/hadoop/test/CoreTestDriver.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.test; + +import org.apache.hadoop.io.TestArrayFile; +import org.apache.hadoop.io.TestSetFile; +import org.apache.hadoop.ipc.TestIPC; +import org.apache.hadoop.ipc.TestRPC; +import org.apache.hadoop.util.ProgramDriver; + +/** + * Driver for core tests. 
+ */ +public class CoreTestDriver { + + private ProgramDriver pgd; + + public CoreTestDriver() { + this(new ProgramDriver()); + } + + public CoreTestDriver(ProgramDriver pgd) { + this.pgd = pgd; + try { + pgd.addClass("testsetfile", TestSetFile.class, + "A test for flat files of binary key/value pairs."); + pgd.addClass("testarrayfile", TestArrayFile.class, + "A test for flat files of binary key/value pairs."); + pgd.addClass("testrpc", TestRPC.class, "A test for rpc."); + pgd.addClass("testipc", TestIPC.class, "A test for ipc."); + } catch(Throwable e) { + e.printStackTrace(); + } + } + + public void run(String argv[]) { + try { + pgd.driver(argv); + } catch(Throwable e) { + e.printStackTrace(); + } + } + + public static void main(String argv[]){ + new CoreTestDriver().run(argv); + } +} diff --git a/src/test/org/apache/hadoop/util/TestCyclicIteration.java b/src/test/org/apache/hadoop/util/TestCyclicIteration.java new file mode 100644 index 00000000000..7dfa4763e19 --- /dev/null +++ b/src/test/org/apache/hadoop/util/TestCyclicIteration.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.util; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; + +public class TestCyclicIteration extends junit.framework.TestCase { + public void testCyclicIteration() throws Exception { + for(int n = 0; n < 5; n++) { + checkCyclicIteration(n); + } + } + + private static void checkCyclicIteration(int numOfElements) { + //create a tree map + final NavigableMap<Integer, Integer> map = new TreeMap<Integer, Integer>(); + final Integer[] integers = new Integer[numOfElements]; + for(int i = 0; i < integers.length; i++) { + integers[i] = 2*i; + map.put(integers[i], integers[i]); + } + System.out.println("\n\nintegers=" + Arrays.asList(integers)); + System.out.println("map=" + map); + + //try starting everywhere + for(int start = -1; start <= 2*integers.length - 1; start++) { + //get a cyclic iteration + final List<Integer> iteration = new ArrayList<Integer>(); + for(Map.Entry<Integer, Integer> e : new CyclicIteration<Integer, Integer>(map, start)) { + iteration.add(e.getKey()); + } + System.out.println("start=" + start + ", iteration=" + iteration); + + //verify results + for(int i = 0; i < integers.length; i++) { + final int j = ((start+2)/2 + i)%integers.length; + assertEquals("i=" + i + ", j=" + j, iteration.get(i), integers[j]); + } + } + } +} diff --git a/src/test/org/apache/hadoop/util/TestGenericsUtil.java b/src/test/org/apache/hadoop/util/TestGenericsUtil.java new file mode 100644 index 00000000000..af494c909d1 --- /dev/null +++ b/src/test/org/apache/hadoop/util/TestGenericsUtil.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.util; + +import java.util.ArrayList; +import java.util.List; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; + +public class TestGenericsUtil extends TestCase { + + public void testToArray() { + + //test a list of size 10 + List<Integer> list = new ArrayList<Integer>(); + + for(int i=0; i<10; i++) { + list.add(i); + } + + Integer[] arr = GenericsUtil.toArray(list); + + for (int i = 0; i < arr.length; i++) { + assertEquals(list.get(i), arr[i]); + } + } + + public void testWithEmptyList() { + try { + List<String> list = new ArrayList<String>(); + String[] arr = GenericsUtil.toArray(list); + fail("Empty array should throw exception"); + System.out.println(arr); //use arr so that compiler will not complain + + }catch (IndexOutOfBoundsException ex) { + //test case is successful + } + } + + public void testWithEmptyList2() { + List<String> list = new ArrayList<String>(); + //this method should not throw IndexOutOfBoundsException + String[] arr = GenericsUtil.toArray(String.class, list); + + assertEquals(0, arr.length); + } + + /** This class uses generics */ + private class GenericClass<T> { + T dummy; + List<T> list = new ArrayList<T>(); + + void add(T item) { + list.add(item); + } + + T[] funcThatUsesToArray() { + T[] arr = GenericsUtil.toArray(list); + return arr; + } + } + + public void testWithGenericClass() { + + GenericClass<String> testSubject = new GenericClass<String>(); + + testSubject.add("test1"); + testSubject.add("test2"); + + try { + //this cast would fail, if we had not used GenericsUtil.toArray, since the + //method would return Object[] rather than String[] + String[] arr = testSubject.funcThatUsesToArray(); + + assertEquals("test1", arr[0]); + assertEquals("test2", arr[1]); + + }catch (ClassCastException ex) { + fail("GenericsUtil#toArray() is not working for generic classes"); + } + + } + + public void testGenericOptionsParser() throws Exception { + GenericOptionsParser parser = new GenericOptionsParser( + new Configuration(), new String[] {"-jt"}); + assertEquals(parser.getRemainingArgs().length, 0); + } + + public void testGetClass() { + + //test with Integer + Integer x = new Integer(42); + Class<Integer> c = GenericsUtil.getClass(x); + assertEquals(Integer.class, c); + + //test with GenericClass<Integer> + GenericClass<Integer> testSubject = new GenericClass<Integer>(); + Class<GenericClass<Integer>> c2 = GenericsUtil.getClass(testSubject); + assertEquals(GenericClass.class, c2); + } + +} diff --git a/src/test/org/apache/hadoop/util/TestIndexedSort.java b/src/test/org/apache/hadoop/util/TestIndexedSort.java new file mode 100644 index 00000000000..d806a0adce9 --- /dev/null +++ b/src/test/org/apache/hadoop/util/TestIndexedSort.java @@ -0,0 +1,361 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.util; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Random; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.DataInputBuffer; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparator; + +public class TestIndexedSort extends TestCase { + + public void sortAllEqual(IndexedSorter sorter) throws Exception { + final int SAMPLE = 500; + int[] values = new int[SAMPLE]; + Arrays.fill(values, 10); + SampleSortable s = new SampleSortable(values); + sorter.sort(s, 0, SAMPLE); + int[] check = s.getSorted(); + assertTrue(Arrays.toString(values) + "\ndoesn't match\n" + + Arrays.toString(check), Arrays.equals(values, check)); + // Set random min/max, re-sort. + Random r = new Random(); + int min = r.nextInt(SAMPLE); + int max = (min + 1 + r.nextInt(SAMPLE - 2)) % SAMPLE; + values[min] = 9; + values[max] = 11; + System.out.println("testAllEqual setting min/max at " + min + "/" + max + + "(" + sorter.getClass().getName() + ")"); + s = new SampleSortable(values); + sorter.sort(s, 0, SAMPLE); + check = s.getSorted(); + Arrays.sort(values); + assertTrue(check[0] == 9); + assertTrue(check[SAMPLE - 1] == 11); + assertTrue(Arrays.toString(values) + "\ndoesn't match\n" + + Arrays.toString(check), Arrays.equals(values, check)); + } + + public void sortSorted(IndexedSorter sorter) throws Exception { + final int SAMPLE = 500; + int[] values = new int[SAMPLE]; + Random r = new Random(); + long seed = r.nextLong(); + r.setSeed(seed); + System.out.println("testSorted seed: " + seed + + "(" + sorter.getClass().getName() + ")"); + for (int i = 0; i < SAMPLE; ++i) { + values[i] = r.nextInt(100); + } + Arrays.sort(values); + SampleSortable s = new SampleSortable(values); + sorter.sort(s, 0, SAMPLE); + int[] check = s.getSorted(); + assertTrue(Arrays.toString(values) + "\ndoesn't match\n" + + Arrays.toString(check), Arrays.equals(values, check)); + } + + public void sortSequential(IndexedSorter sorter) throws Exception { + final int SAMPLE = 500; + int[] values = new int[SAMPLE]; + for (int i = 0; i < SAMPLE; ++i) { + values[i] = i; + } + SampleSortable s = new SampleSortable(values); + sorter.sort(s, 0, SAMPLE); + int[] check = s.getSorted(); + assertTrue(Arrays.toString(values) + "\ndoesn't match\n" + + Arrays.toString(check), Arrays.equals(values, check)); + } + + public void sortSingleRecord(IndexedSorter sorter) throws Exception { + final int SAMPLE = 1; + SampleSortable s = new SampleSortable(SAMPLE); + int[] values = s.getValues(); + sorter.sort(s, 0, SAMPLE); + int[] check = s.getSorted(); + assertTrue(Arrays.toString(values) + "\ndoesn't match\n" + + Arrays.toString(check), Arrays.equals(values, check)); + } + + public void sortRandom(IndexedSorter sorter) throws Exception { + final int SAMPLE = 256 * 1024; + SampleSortable s = new SampleSortable(SAMPLE); + long seed = s.getSeed(); + System.out.println("sortRandom seed: " + seed + + "(" + sorter.getClass().getName() + ")"); + int[] values = s.getValues(); + Arrays.sort(values); + sorter.sort(s, 0, SAMPLE); + int[] check = s.getSorted(); + assertTrue("seed: " + seed + "\ndoesn't match\n", + Arrays.equals(values, check)); + } + + public void sortWritable(IndexedSorter sorter) throws Exception { + final int SAMPLE = 1000; + WritableSortable s = new WritableSortable(SAMPLE); + long seed = s.getSeed(); + System.out.println("sortWritable seed: " + seed + + "(" + 
sorter.getClass().getName() + ")"); + String[] values = s.getValues(); + Arrays.sort(values); + sorter.sort(s, 0, SAMPLE); + String[] check = s.getSorted(); + assertTrue("seed: " + seed + "\ndoesn't match", + Arrays.equals(values, check)); + } + + + public void testQuickSort() throws Exception { + QuickSort sorter = new QuickSort(); + sortRandom(sorter); + sortSingleRecord(sorter); + sortSequential(sorter); + sortSorted(sorter); + sortAllEqual(sorter); + sortWritable(sorter); + + // test degenerate case for median-of-three partitioning + // a_n, a_1, a_2, ..., a_{n-1} + final int DSAMPLE = 500; + int[] values = new int[DSAMPLE]; + for (int i = 0; i < DSAMPLE; ++i) { values[i] = i; } + values[0] = values[DSAMPLE - 1] + 1; + SampleSortable s = new SampleSortable(values); + values = s.getValues(); + final int DSS = (DSAMPLE / 2) * (DSAMPLE / 2); + // Worst case is (N/2)^2 comparisons, not including those effecting + // the median-of-three partitioning; impl should handle this case + MeasuredSortable m = new MeasuredSortable(s, DSS); + sorter.sort(m, 0, DSAMPLE); + System.out.println("QuickSort degen cmp/swp: " + + m.getCmp() + "/" + m.getSwp() + + "(" + sorter.getClass().getName() + ")"); + Arrays.sort(values); + int[] check = s.getSorted(); + assertTrue(Arrays.equals(values, check)); + } + + public void testHeapSort() throws Exception { + HeapSort sorter = new HeapSort(); + sortRandom(sorter); + sortSingleRecord(sorter); + sortSequential(sorter); + sortSorted(sorter); + sortAllEqual(sorter); + sortWritable(sorter); + } + + // Sortables // + + private static class SampleSortable implements IndexedSortable { + private int[] valindex; + private int[] valindirect; + private int[] values; + private final long seed; + + public SampleSortable() { + this(50); + } + + public SampleSortable(int j) { + Random r = new Random(); + seed = r.nextLong(); + r.setSeed(seed); + values = new int[j]; + valindex = new int[j]; + valindirect = new int[j]; + for (int i = 0; i < j; ++i) { + valindex[i] = valindirect[i] = i; + values[i] = r.nextInt(1000); + } + } + + public SampleSortable(int[] values) { + this.values = values; + valindex = new int[values.length]; + valindirect = new int[values.length]; + for (int i = 0; i < values.length; ++i) { + valindex[i] = valindirect[i] = i; + } + seed = 0; + } + + public long getSeed() { + return seed; + } + + public int compare(int i, int j) { + // assume positive + return + values[valindirect[valindex[i]]] - values[valindirect[valindex[j]]]; + } + + public void swap(int i, int j) { + int tmp = valindex[i]; + valindex[i] = valindex[j]; + valindex[j] = tmp; + } + + public int[] getSorted() { + int[] ret = new int[values.length]; + for (int i = 0; i < ret.length; ++i) { + ret[i] = values[valindirect[valindex[i]]]; + } + return ret; + } + + public int[] getValues() { + int[] ret = new int[values.length]; + System.arraycopy(values, 0, ret, 0, values.length); + return ret; + } + + } + + public static class MeasuredSortable implements IndexedSortable { + + private int comparisions; + private int swaps; + private final int maxcmp; + private final int maxswp; + private IndexedSortable s; + + public MeasuredSortable(IndexedSortable s) { + this(s, Integer.MAX_VALUE); + } + + public MeasuredSortable(IndexedSortable s, int maxcmp) { + this(s, maxcmp, Integer.MAX_VALUE); + } + + public MeasuredSortable(IndexedSortable s, int maxcmp, int maxswp) { + this.s = s; + this.maxcmp = maxcmp; + this.maxswp = maxswp; + } + + public int getCmp() { return comparisions; } + public int getSwp() { 
return swaps; } + + public int compare(int i, int j) { + assertTrue("Expected fewer than " + maxcmp + " comparisons", + ++comparisions < maxcmp); + return s.compare(i, j); + } + + public void swap(int i, int j) { + assertTrue("Expected fewer than " + maxswp + " swaps", + ++swaps < maxswp); + s.swap(i, j); + } + + } + + private static class WritableSortable implements IndexedSortable { + + private static Random r = new Random(); + private final int eob; + private final int[] indices; + private final int[] offsets; + private final byte[] bytes; + private final WritableComparator comparator; + private final String[] check; + private final long seed; + + public WritableSortable() throws IOException { + this(100); + } + + public WritableSortable(int j) throws IOException { + seed = r.nextLong(); + r.setSeed(seed); + Text t = new Text(); + StringBuffer sb = new StringBuffer(); + indices = new int[j]; + offsets = new int[j]; + check = new String[j]; + DataOutputBuffer dob = new DataOutputBuffer(); + for (int i = 0; i < j; ++i) { + indices[i] = i; + offsets[i] = dob.getLength(); + genRandom(t, r.nextInt(15) + 1, sb); + t.write(dob); + check[i] = t.toString(); + } + eob = dob.getLength(); + bytes = dob.getData(); + comparator = WritableComparator.get(Text.class); + } + + public long getSeed() { + return seed; + } + + private static void genRandom(Text t, int len, StringBuffer sb) { + sb.setLength(0); + for (int i = 0; i < len; ++i) { + sb.append(Integer.toString(r.nextInt(26) + 10, 36)); + } + t.set(sb.toString()); + } + + public int compare(int i, int j) { + final int ii = indices[i]; + final int ij = indices[j]; + return comparator.compare(bytes, offsets[ii], + ((ii + 1 == indices.length) ? eob : offsets[ii + 1]) - offsets[ii], + bytes, offsets[ij], + ((ij + 1 == indices.length) ? eob : offsets[ij + 1]) - offsets[ij]); + } + + public void swap(int i, int j) { + int tmp = indices[i]; + indices[i] = indices[j]; + indices[j] = tmp; + } + + public String[] getValues() { + return check; + } + + public String[] getSorted() throws IOException { + String[] ret = new String[indices.length]; + Text t = new Text(); + DataInputBuffer dib = new DataInputBuffer(); + for (int i = 0; i < ret.length; ++i) { + int ii = indices[i]; + dib.reset(bytes, offsets[ii], + ((ii + 1 == indices.length) ? eob : offsets[ii + 1]) - offsets[ii]); + t.readFields(dib); + ret[i] = t.toString(); + } + return ret; + } + + } + +} diff --git a/src/test/org/apache/hadoop/util/TestProcfsBasedProcessTree.java b/src/test/org/apache/hadoop/util/TestProcfsBasedProcessTree.java new file mode 100644 index 00000000000..0b975074026 --- /dev/null +++ b/src/test/org/apache/hadoop/util/TestProcfsBasedProcessTree.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.util; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Random; +import java.util.Vector; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.Shell.ExitCodeException; +import org.apache.hadoop.util.Shell.ShellCommandExecutor; + +import junit.framework.TestCase; + +/** + * A JUnit test to test ProcfsBasedProcessTree. + */ +public class TestProcfsBasedProcessTree extends TestCase { + + private static final Log LOG = LogFactory + .getLog(TestProcfsBasedProcessTree.class); + private static String TEST_ROOT_DIR = new Path(System.getProperty( + "test.build.data", "/tmp")).toString().replace(' ', '+'); + + private ShellCommandExecutor shexec = null; + private String pidFile, lowestDescendant; + private String shellScript; + private static final int N = 6; // Controls the RogueTask + + private class RogueTaskThread extends Thread { + public void run() { + try { + Vector<String> args = new Vector<String>(); + if(ProcessTree.isSetsidAvailable) { + args.add("setsid"); + } + args.add("bash"); + args.add("-c"); + args.add(" echo $$ > " + pidFile + "; sh " + + shellScript + " " + N + ";") ; + shexec = new ShellCommandExecutor(args.toArray(new String[0])); + shexec.execute(); + } catch (ExitCodeException ee) { + LOG.info("Shell Command exit with a non-zero exit code. This is" + + " expected as we are killing the subprocesses of the" + + " task intentionally. " + ee); + } catch (IOException ioe) { + LOG.info("Error executing shell command " + ioe); + } finally { + LOG.info("Exit code: " + shexec.getExitCode()); + } + } + } + + private String getRogueTaskPID() { + File f = new File(pidFile); + while (!f.exists()) { + try { + Thread.sleep(500); + } catch (InterruptedException ie) { + break; + } + } + + // read from pidFile + return getPidFromPidFile(pidFile); + } + + public void testProcessTree() { + + try { + if (!ProcfsBasedProcessTree.isAvailable()) { + System.out + .println("ProcfsBasedProcessTree is not available on this system. 
Not testing"); + return; + } + } catch (Exception e) { + LOG.info(StringUtils.stringifyException(e)); + return; + } + // create shell script + Random rm = new Random(); + File tempFile = new File(TEST_ROOT_DIR, this.getName() + "_shellScript_" + + rm.nextInt() + ".sh"); + tempFile.deleteOnExit(); + shellScript = TEST_ROOT_DIR + File.separator + tempFile.getName(); + + // create pid file + tempFile = new File(TEST_ROOT_DIR, this.getName() + "_pidFile_" + + rm.nextInt() + ".pid"); + tempFile.deleteOnExit(); + pidFile = TEST_ROOT_DIR + File.separator + tempFile.getName(); + + lowestDescendant = TEST_ROOT_DIR + File.separator + "lowestDescendantPidFile"; + + // write to shell-script + try { + FileWriter fWriter = new FileWriter(shellScript); + fWriter.write( + "# rogue task\n" + + "sleep 1\n" + + "echo hello\n" + + "if [ $1 -ne 0 ]\n" + + "then\n" + + " sh " + shellScript + " $(($1-1))\n" + + "else\n" + + " echo $$ > " + lowestDescendant + "\n" + + " while true\n do\n" + + " sleep 5\n" + + " done\n" + + "fi"); + fWriter.close(); + } catch (IOException ioe) { + LOG.info("Error: " + ioe); + return; + } + + Thread t = new RogueTaskThread(); + t.start(); + String pid = getRogueTaskPID(); + LOG.info("Root process pid: " + pid); + ProcfsBasedProcessTree p = new ProcfsBasedProcessTree(pid, + ProcessTree.isSetsidAvailable, + ProcessTree.DEFAULT_SLEEPTIME_BEFORE_SIGKILL); + p = p.getProcessTree(); // initialize + LOG.info("ProcessTree: " + p.toString()); + + File leaf = new File(lowestDescendant); + //wait till lowest descendant process of Rogue Task starts execution + while (!leaf.exists()) { + try { + Thread.sleep(500); + } catch (InterruptedException ie) { + break; + } + } + + p = p.getProcessTree(); // reconstruct + LOG.info("ProcessTree: " + p.toString()); + + // destroy the map task and all its subprocesses + p.destroy(true/*in the background*/); + + if(ProcessTree.isSetsidAvailable) {// whole processtree should be gone + assertEquals(false, p.isAnyProcessInTreeAlive()); + } + else {// process should be gone + assertFalse("ProcessTree must have been gone", p.isAlive()); + } + // Not able to join thread sometimes when forking with large N. + try { + t.join(2000); + LOG.info("RogueTaskThread successfully joined."); + } catch (InterruptedException ie) { + LOG.info("Interrupted while joining RogueTaskThread."); + } + + // ProcessTree is gone now. Any further calls should be sane. + p = p.getProcessTree(); + assertFalse("ProcessTree must have been gone", p.isAlive()); + assertTrue("Cumulative vmem for the gone-process is " + + p.getCumulativeVmem() + " . It should be zero.", p + .getCumulativeVmem() == 0); + assertTrue(p.toString().equals("[ ]")); + } + + /** + * Get PID from a pid-file. + * + * @param pidFileName + * Name of the pid-file. + * @return the PID string read from the pid-file. Returns null if the + * pidFileName points to a non-existing file or if read fails from the + * file. 
+ */ + public static String getPidFromPidFile(String pidFileName) { + BufferedReader pidFile = null; + FileReader fReader = null; + String pid = null; + + try { + fReader = new FileReader(pidFileName); + pidFile = new BufferedReader(fReader); + } catch (FileNotFoundException f) { + LOG.debug("PidFile doesn't exist : " + pidFileName); + return pid; + } + + try { + pid = pidFile.readLine(); + } catch (IOException i) { + LOG.error("Failed to read from " + pidFileName); + } finally { + try { + if (fReader != null) { + fReader.close(); + } + try { + if (pidFile != null) { + pidFile.close(); + } + } catch (IOException i) { + LOG.warn("Error closing the stream " + pidFile); + } + } catch (IOException i) { + LOG.warn("Error closing the stream " + fReader); + } + } + return pid; + } +} diff --git a/src/test/org/apache/hadoop/util/TestShell.java b/src/test/org/apache/hadoop/util/TestShell.java new file mode 100644 index 00000000000..ca7303187bc --- /dev/null +++ b/src/test/org/apache/hadoop/util/TestShell.java @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.util; + +import junit.framework.TestCase; + +import java.io.BufferedReader; +import java.io.IOException; + +public class TestShell extends TestCase { + + private static class Command extends Shell { + private int runCount = 0; + + private Command(long interval) { + super(interval); + } + + protected String[] getExecString() { + return new String[] {"echo", "hello"}; + } + + protected void parseExecResult(BufferedReader lines) throws IOException { + ++runCount; + } + + public int getRunCount() { + return runCount; + } + } + + public void testInterval() throws IOException { + testInterval(Long.MIN_VALUE / 60000); // test a negative interval + testInterval(0L); // test a zero interval + testInterval(10L); // interval equal to 10mins + testInterval(System.currentTimeMillis() / 60000 + 60); // test a very big interval + } + + /** + * Assert that a string has a substring in it + * @param string string to search + * @param search what to search for it + */ + private void assertInString(String string, String search) { + assertNotNull("Empty String", string); + if (!string.contains(search)) { + fail("Did not find \"" + search + "\" in " + string); + } + } + + public void testShellCommandExecutorToString() throws Throwable { + Shell.ShellCommandExecutor sce=new Shell.ShellCommandExecutor( + new String[] { "ls","..","arg 2"}); + String command = sce.toString(); + assertInString(command,"ls"); + assertInString(command, " .. 
"); + assertInString(command, "\"arg 2\""); + } + + private void testInterval(long interval) throws IOException { + Command command = new Command(interval); + + command.run(); + assertEquals(1, command.getRunCount()); + + command.run(); + if (interval > 0) { + assertEquals(1, command.getRunCount()); + } else { + assertEquals(2, command.getRunCount()); + } + } +} diff --git a/src/test/org/apache/hadoop/util/TestStringUtils.java b/src/test/org/apache/hadoop/util/TestStringUtils.java new file mode 100644 index 00000000000..e68609ae2ff --- /dev/null +++ b/src/test/org/apache/hadoop/util/TestStringUtils.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util; + +import junit.framework.TestCase; + +public class TestStringUtils extends TestCase { + final private static String NULL_STR = null; + final private static String EMPTY_STR = ""; + final private static String STR_WO_SPECIAL_CHARS = "AB"; + final private static String STR_WITH_COMMA = "A,B"; + final private static String ESCAPED_STR_WITH_COMMA = "A\\,B"; + final private static String STR_WITH_ESCAPE = "AB\\"; + final private static String ESCAPED_STR_WITH_ESCAPE = "AB\\\\"; + final private static String STR_WITH_BOTH2 = ",A\\,,B\\\\,"; + final private static String ESCAPED_STR_WITH_BOTH2 = + "\\,A\\\\\\,\\,B\\\\\\\\\\,"; + + public void testEscapeString() throws Exception { + assertEquals(NULL_STR, StringUtils.escapeString(NULL_STR)); + assertEquals(EMPTY_STR, StringUtils.escapeString(EMPTY_STR)); + assertEquals(STR_WO_SPECIAL_CHARS, + StringUtils.escapeString(STR_WO_SPECIAL_CHARS)); + assertEquals(ESCAPED_STR_WITH_COMMA, + StringUtils.escapeString(STR_WITH_COMMA)); + assertEquals(ESCAPED_STR_WITH_ESCAPE, + StringUtils.escapeString(STR_WITH_ESCAPE)); + assertEquals(ESCAPED_STR_WITH_BOTH2, + StringUtils.escapeString(STR_WITH_BOTH2)); + } + + public void testSplit() throws Exception { + assertEquals(NULL_STR, StringUtils.split(NULL_STR)); + String[] splits = StringUtils.split(EMPTY_STR); + assertEquals(0, splits.length); + splits = StringUtils.split(",,"); + assertEquals(0, splits.length); + splits = StringUtils.split(STR_WO_SPECIAL_CHARS); + assertEquals(1, splits.length); + assertEquals(STR_WO_SPECIAL_CHARS, splits[0]); + splits = StringUtils.split(STR_WITH_COMMA); + assertEquals(2, splits.length); + assertEquals("A", splits[0]); + assertEquals("B", splits[1]); + splits = StringUtils.split(ESCAPED_STR_WITH_COMMA); + assertEquals(1, splits.length); + assertEquals(ESCAPED_STR_WITH_COMMA, splits[0]); + splits = StringUtils.split(STR_WITH_ESCAPE); + assertEquals(1, splits.length); + assertEquals(STR_WITH_ESCAPE, splits[0]); + splits = StringUtils.split(STR_WITH_BOTH2); + assertEquals(3, splits.length); + assertEquals(EMPTY_STR, splits[0]); + 
assertEquals("A\\,", splits[1]); + assertEquals("B\\\\", splits[2]); + splits = StringUtils.split(ESCAPED_STR_WITH_BOTH2); + assertEquals(1, splits.length); + assertEquals(ESCAPED_STR_WITH_BOTH2, splits[0]); + } + + public void testUnescapeString() throws Exception { + assertEquals(NULL_STR, StringUtils.unEscapeString(NULL_STR)); + assertEquals(EMPTY_STR, StringUtils.unEscapeString(EMPTY_STR)); + assertEquals(STR_WO_SPECIAL_CHARS, + StringUtils.unEscapeString(STR_WO_SPECIAL_CHARS)); + try { + StringUtils.unEscapeString(STR_WITH_COMMA); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected + } + assertEquals(STR_WITH_COMMA, + StringUtils.unEscapeString(ESCAPED_STR_WITH_COMMA)); + try { + StringUtils.unEscapeString(STR_WITH_ESCAPE); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected + } + assertEquals(STR_WITH_ESCAPE, + StringUtils.unEscapeString(ESCAPED_STR_WITH_ESCAPE)); + try { + StringUtils.unEscapeString(STR_WITH_BOTH2); + fail("Should throw IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected + } + assertEquals(STR_WITH_BOTH2, + StringUtils.unEscapeString(ESCAPED_STR_WITH_BOTH2)); + } + + public void testTraditionalBinaryPrefix() throws Exception { + String[] symbol = {"k", "m", "g", "t", "p", "e"}; + long m = 1024; + for(String s : symbol) { + assertEquals(0, StringUtils.TraditionalBinaryPrefix.string2long(0 + s)); + assertEquals(m, StringUtils.TraditionalBinaryPrefix.string2long(1 + s)); + m *= 1024; + } + + assertEquals(0L, StringUtils.TraditionalBinaryPrefix.string2long("0")); + assertEquals(-1259520L, StringUtils.TraditionalBinaryPrefix.string2long("-1230k")); + assertEquals(956703965184L, StringUtils.TraditionalBinaryPrefix.string2long("891g")); + } +}