diff --git a/lib/commons-cli-2.0-SNAPSHOT.jar b/lib/commons-cli-2.0-SNAPSHOT.jar
new file mode 100644
index 00000000000..0b1d51072a7
Binary files /dev/null and b/lib/commons-cli-2.0-SNAPSHOT.jar differ
diff --git a/lib/hsqldb-1.8.0.10.LICENSE.txt b/lib/hsqldb-1.8.0.10.LICENSE.txt
new file mode 100644
index 00000000000..d45b9f8cc07
--- /dev/null
+++ b/lib/hsqldb-1.8.0.10.LICENSE.txt
@@ -0,0 +1,66 @@
+/* Copyright (c) 1995-2000, The Hypersonic SQL Group.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of the Hypersonic SQL Group nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE HYPERSONIC SQL GROUP,
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This software consists of voluntary contributions made by many individuals
+ * on behalf of the Hypersonic SQL Group.
+ *
+ *
+ * For work added by the HSQL Development Group:
+ *
+ * Copyright (c) 2001-2004, The HSQL Development Group
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of the HSQL Development Group nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HSQL DEVELOPMENT GROUP, HSQLDB.ORG,
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
diff --git a/lib/hsqldb-1.8.0.10.jar b/lib/hsqldb-1.8.0.10.jar
new file mode 100644
index 00000000000..e010269ddf6
Binary files /dev/null and b/lib/hsqldb-1.8.0.10.jar differ
diff --git a/lib/jdiff/hadoop_0.17.0.xml b/lib/jdiff/hadoop_0.17.0.xml
new file mode 100644
index 00000000000..69dded31403
--- /dev/null
+++ b/lib/jdiff/hadoop_0.17.0.xml
@@ -0,0 +1,43272 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The balancer is a tool that balances disk space usage on an HDFS cluster
+ when some datanodes become full or when new empty nodes join the cluster.
+ The tool is deployed as an application program that can be run by the
+ cluster administrator on a live HDFS cluster while applications
+ adding and deleting files.
+
+
SYNOPSIS
+
+ To start:
+ bin/start-balancer.sh [-threshold ]
+ Example: bin/ start-balancer.sh
+ start the balancer with a default threshold of 10%
+ bin/ start-balancer.sh -threshold 5
+ start the balancer with a threshold of 5%
+ To stop:
+ bin/ stop-balancer.sh
+
+
+
DESCRIPTION
+
The threshold parameter is a fraction in the range of (0%, 100%) with a
+ default value of 10%. The threshold sets a target for whether the cluster
+ is balanced. A cluster is balanced if for each datanode, the utilization
+ of the node (ratio of used space at the node to total capacity of the node)
+ differs from the utilization of the (ratio of used space in the cluster
+ to total capacity of the cluster) by no more than the threshold value.
+ The smaller the threshold, the more balanced a cluster will become.
+ It takes more time to run the balancer for small threshold values.
+ Also for a very small threshold the cluster may not be able to reach the
+ balanced state when applications write and delete files concurrently.
+
+
The tool moves blocks from highly utilized datanodes to poorly
+ utilized datanodes iteratively. In each iteration a datanode moves or
+ receives no more than the lesser of 10G bytes or the threshold fraction
+ of its capacity. Each iteration runs no more than 20 minutes.
+ At the end of each iteration, the balancer obtains updated datanodes
+ information from the namenode.
+
+
A system property that limits the balancer's use of bandwidth is
+ defined in the default configuration file:
+
+
+ dfs.balance.bandwidthPerSec
+ 1048576
+ Specifies the maximum bandwidth that each datanode
+ can utilize for the balancing purpose in term of the number of bytes
+ per second.
+
+
+
+
This property determines the maximum speed at which a block will be
+ moved from one datanode to another. The default value is 1MB/s. The higher
+ the bandwidth, the faster a cluster can reach the balanced state,
+ but with greater competition with application processes. If an
+ administrator changes the value of this property in the configuration
+ file, the change is observed when HDFS is next restarted.
+
+
MONITERING BALANCER PROGRESS
+
After the balancer is started, an output file name where the balancer
+ progress will be recorded is printed on the screen. The administrator
+ can monitor the running of the balancer by reading the output file.
+ The output shows the balancer's status iteration by iteration. In each
+ iteration it prints the starting time, the iteration number, the total
+ number of bytes that have been moved in the previous iterations,
+ the total number of bytes that are left to move in order for the cluster
+ to be balanced, and the number of bytes that are being moved in this
+ iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left
+ To Move" is decreasing.
+
+
Running multiple instances of the balancer in an HDFS cluster is
+ prohibited by the tool.
+
+
The balancer automatically exits when any of the following five
+ conditions is satisfied:
+
+
The cluster is balanced;
+
No block can be moved;
+
No block has been moved for five consecutive iterations;
+
An IOException occurs while communicating with the namenode;
+
Another balancer is running.
+
+
+
Upon exit, a balancer returns an exit code and prints one of the
+ following messages to the output file in corresponding to the above exit
+ reasons:
+
+
The cluster is balanced. Exiting
+
No block can be moved. Exiting...
+
No block has been moved for 3 iterations. Exiting...
+
Received an IO exception: failure reason. Exiting...
+
Another balancer is running. Exiting...
+
+
+
The administrator can interrupt the execution of the balancer at any
+ time by running the command "stop-balancer.sh" on the machine where the
+ balancer is running.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ stream of bytes (of BLOCK_SIZE or less)
+
+ This info is stored on a local disk. The DataNode
+ reports the table's contents to the NameNode upon startup
+ and every so often afterwards.
+
+ DataNodes spend their lives in an endless loop of asking
+ the NameNode for something to do. A NameNode cannot connect
+ to a DataNode directly; a NameNode simply returns values from
+ functions invoked by a DataNode.
+
+ DataNodes maintain an open server socket so that client code
+ or other DataNodes can read/write data. The host/port for
+ this server is reported to the NameNode, which then sends that
+ information to clients or other DataNodes that might be interested.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The tool scans all files and directories, starting from an indicated
+ root path. The following abnormal conditions are detected and handled:
+
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link NamenodeFsck#FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link NamenodeFsck#FIXING_MOVE}). Remaining data blocks are saved as a
+ block chains, representing longest consecutive series of valid blocks.
{@link FSConstants.StartupOption#REGULAR REGULAR} - normal startup
+
{@link FSConstants.StartupOption#FORMAT FORMAT} - format name node
+
{@link FSConstants.StartupOption#UPGRADE UPGRADE} - start the cluster
+ upgrade and create a snapshot of the current file system state
+
{@link FSConstants.StartupOption#ROLLBACK ROLLBACK} - roll the
+ cluster back to the previous state
+
+ The option is passed via configuration field:
+ dfs.namenode.startup
+
+ The conf will be modified to reflect the actual ports on which
+ the NameNode is up and running if the user passes the port as
+ zero in the conf.
+
+ @param conf confirguration
+ @throws IOException]]>
+
+
+
+
+
+ zero.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ datanode whose
+ total size is size
+
+ @param datanode on which blocks are located
+ @param size total size of blocks]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ blocksequence (namespace)
+ 2) block->machinelist ("inodes")
+
+ The first table is stored on disk and is very precious.
+ The second table is rebuilt every time the NameNode comes
+ up.
+
+ 'NameNode' refers to both this class as well as the 'NameNode server'.
+ The 'FSNamesystem' class actually performs most of the filesystem
+ management. The majority of the 'NameNode' class itself is concerned
+ with exposing the IPC interface to the outside world, plus some
+ configuration management.
+
+ NameNode implements the ClientProtocol interface, which allows
+ clients to ask for DFS services. ClientProtocol is not
+ designed for direct use by authors of DFS client code. End-users
+ should instead use the org.apache.nutch.hadoop.fs.FileSystem class.
+
+ NameNode also implements the DatanodeProtocol interface, used by
+ DataNode programs that actually store DFS data blocks. These
+ methods are invoked repeatedly and automatically by all the
+ DataNodes in a DFS deployment.
+
+ NameNode also implements the NamenodeProtocol interface, used by
+ secondary namenodes or rebalancing processes to get partial namenode's
+ state, for example partial blocksMap etc.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The tool scans all files and directories, starting from an indicated
+ root path. The following abnormal conditions are detected and handled:
+
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link #FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link #FIXING_MOVE}). Remaining data blocks are saved as a
+ block chains, representing longest consecutive series of valid blocks.
+
delete corrupted files ({@link #FIXING_DELETE})
+
+
+
detect files with under-replicated or over-replicated blocks
+
+ Additionally, the tool collects a detailed overall DFS statistics, and
+ optionally can print detailed statistics on block locations and replication
+ factors of each file.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
The most important difference is that unlike GFS, Hadoop DFS files
+have strictly one writer at any one time. Bytes are always appended
+to the end of the writer's stream. There is no notion of "record appends"
+or "mutations" that are then checked or reordered. Writers simply emit
+a byte stream. That byte stream is guaranteed to be stored in the
+order written.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip files) are un-archived at the slave nodes. Jars maybe be
+ optionally added to the classpath of the tasks, a rudimentary software
+ distribution mechanism. Files have execution permissions. Optionally users
+ can also direct it to symlink the distributed cache file(s) into
+ the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a pth pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf desitination buffer
+ @param offset offset in buf at which to store data
+ @param len maximun number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the postion to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destiniation buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids uneccessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+This pages describes how to use Kosmos Filesystem
+( KFS ) as a backing
+store with Hadoop. This page assumes that you have downloaded the
+KFS software and installed necessary binaries as outlined in the KFS
+documentation.
+
+
Steps
+
+
+
In the Hadoop conf directory edit hadoop-default.xml,
+ add the following:
+
In the Hadoop conf directory edit hadoop-site.xml,
+ adding the following (with appropriate values for
+ <server> and <port>):
+
+<property>
+ <name>fs.default.name</name>
+ <value>kfs://<server:port></value>
+</property>
+
+<property>
+ <name>fs.kfs.metaServerHost</name>
+ <value><server></value>
+ <description>The location of the KFS meta server.</description>
+</property>
+
+<property>
+ <name>fs.kfs.metaServerPort</name>
+ <value><port></value>
+ <description>The location of the meta server's port.</description>
+</property>
+
+
+
+
+
Copy KFS's kfs-0.1.jar to Hadoop's lib directory. This step
+ enables Hadoop to load the KFS specific modules. Note
+ that, kfs-0.1.jar was built when you compiled KFS source
+ code. This jar file contains code that calls KFS's client
+ library code via JNI; the native code is in KFS's
+ libkfsClient.so library.
+
+
+
When the Hadoop map/reduce trackers start up, those
+processes (on local as well as remote nodes) will now need to load
+KFS's libkfsClient.so library. To simplify this process, it is advisable to
+store libkfsClient.so in an NFS accessible directory (similar to where
+Hadoop binaries/scripts are stored); then, modify Hadoop's
+conf/hadoop-env.sh adding the following line and providing suitable
+value for <path>:
+
+export LD_LIBRARY_PATH=<path>
+
+
+
+
Start only the map/reduce trackers
+
+ example: execute Hadoop's bin/start-mapred.sh
+Files are stored in S3 as blocks (represented by
+{@link org.apache.hadoop.fs.s3.Block}), which have an ID and a length.
+Block metadata is stored in S3 as a small record (represented by
+{@link org.apache.hadoop.fs.s3.INode}) using the URL-encoded
+path string as a key. Inodes record the file type (regular file or directory) and the list of blocks.
+This design makes it easy to seek to any given position in a file by reading the inode data to compute
+which block to access, then using S3's support for
+HTTP Range headers
+to start streaming from the correct position.
+Renames are also efficient since only the inode is moved (by a DELETE followed by a PUT since
+S3 does not support renames).
+
+
+For a single file /dir1/file1 which takes two blocks of storage, the file structure in S3
+would be something like this:
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types is not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines
+ the classes which will be wrapped in GenericObject in application.
+ Attention: this classes defined in getTypes() method, must
+ implement Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to chose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting as position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserialing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.value;
+ int thatValue = ((IntWritable)o).value;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when skipped less number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+A mechanism for selectively retrying methods that throw exceptions under certain circumstances.
+
+
+
+This will retry any method called on unreliable four times - in this case the call()
+method - sleeping 10 seconds between
+each retry. There are a number of {@link org.apache.hadoop.io.retry.RetryPolicies retry policies}
+available, or you can implement a custom one by implementing {@link org.apache.hadoop.io.retry.RetryPolicy}.
+It is also possible to specify retry policies on a
+{@link org.apache.hadoop.io.retry.RetryProxy#create(Class, Object, Map) per-method basis}.
+
]]>
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This package provides a mechanism for using different serialization frameworks
+in Hadoop. The property "io.serializations" defines a list of
+{@link org.apache.hadoop.io.serializer.Serialization}s that know how to create
+{@link org.apache.hadoop.io.serializer.Serializer}s and
+{@link org.apache.hadoop.io.serializer.Deserializer}s.
+
+
+
+To add a new serialization framework write an implementation of
+{@link org.apache.hadoop.io.serializer.Serialization} and add its name to the
+"io.serializations" property.
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcDiscardedOps}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Grouphandles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides generic implementations of
+ {@link #validateInput(JobConf)} and {@link #getSplits(JobConf, int)}.
+ Implementations fo FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the taskid, say
+ task_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This method is used to validate the input directories when a job is
+ submitted so that the {@link JobClient} can fail early, with a useful
+ error message, in case of errors. For e.g. input directory does not exist.
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input]]>
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size is insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes awhile, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing it out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ with the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it is
+ a job is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. Typically all values are combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
A Map-Reduce job usually splits the input data-set into independent
+chunks which are processed by map tasks in a completely parallel manner,
+followed by reduce tasks which aggregate their output. Typically both
+the input and the output of the job are stored in a
+{@link org.apache.hadoop.fs.FileSystem}. The framework takes care of monitoring
+tasks and re-executing failed ones. Since, usually, the compute nodes and the
+storage nodes are the same i.e. Hadoop's Map-Reduce framework and Distributed
+FileSystem are running on the same set of nodes, tasks are effectively scheduled
+on the nodes where data is already present, resulting in very high aggregate
+bandwidth across the cluster.
+
+
The Map-Reduce framework operates exclusively on <key, value>
+pairs i.e. the input to the job is viewed as a set of <key, value>
+pairs and the output as another, possibly different, set of
+<key, value> pairs. The keys and values have to
+be serializable as {@link org.apache.hadoop.io.Writable}s and additionally the
+keys have to be {@link org.apache.hadoop.io.WritableComparable}s in
+order to facilitate grouping by the framework.
+
+
Data flow:
+
+ (input)
+ <k1, v1>
+
+ |
+ V
+
+ map
+
+ |
+ V
+
+ <k2, v2>
+
+ |
+ V
+
+ combine
+
+ |
+ V
+
+ <k2, v2>
+
+ |
+ V
+
+ reduce
+
+ |
+ V
+
+ <k3, v3>
+ (output)
+
+
+
Applications typically implement
+{@link org.apache.hadoop.mapred.Mapper#map(Object, Object, OutputCollector, Reporter)}
+and
+{@link org.apache.hadoop.mapred.Reducer#reduce(Object, Iterator, OutputCollector, Reporter)}
+methods. The application-writer also specifies various facets of the job such
+as input and output locations, the Partitioner, InputFormat
+& OutputFormat implementations to be used etc. as
+a {@link org.apache.hadoop.mapred.JobConf}. The client program,
+{@link org.apache.hadoop.mapred.JobClient}, then submits the job to the framework
+and optionally monitors it.
+
+
The framework spawns one map task per
+{@link org.apache.hadoop.mapred.InputSplit} generated by the
+{@link org.apache.hadoop.mapred.InputFormat} of the job and calls
+{@link org.apache.hadoop.mapred.Mapper#map(Object, Object, OutputCollector, Reporter)}
+with each <key, value> pair read by the
+{@link org.apache.hadoop.mapred.RecordReader} from the InputSplit for
+the task. The intermediate outputs of the maps are then grouped by keys
+and optionally aggregated by the combiner. The key space of intermediate
+outputs is partitioned by the {@link org.apache.hadoop.mapred.Partitioner}, where
+the number of partitions is exactly the number of reduce tasks for the job.
+
+
The reduce tasks fetch the sorted intermediate outputs of the maps, via http,
+merge the <key, value> pairs and call
+{@link org.apache.hadoop.mapred.Reducer#reduce(Object, Iterator, OutputCollector, Reporter)}
+for each <key, list of values> pair. The output of the reduce tasks' is
+stored on the FileSystem by the
+{@link org.apache.hadoop.mapred.RecordWriter} provided by the
+{@link org.apache.hadoop.mapred.OutputFormat} of the job.
+
+
Map-Reduce application to perform a distributed grep:
+
+public class Grep extends Configured implements Tool {
+
+ // map: Search for the pattern specified by 'grep.mapper.regex' &
+ // 'grep.mapper.regex.group'
+
+ class GrepMapper<K, Text>
+ extends MapReduceBase implements Mapper<K, Text, Text, LongWritable> {
+
+ private Pattern pattern;
+ private int group;
+
+ public void configure(JobConf job) {
+ pattern = Pattern.compile(job.get("grep.mapper.regex"));
+ group = job.getInt("grep.mapper.regex.group", 0);
+ }
+
+ public void map(K key, Text value,
+ OutputCollector<Text, LongWritable> output,
+ Reporter reporter)
+ throws IOException {
+ String text = value.toString();
+ Matcher matcher = pattern.matcher(text);
+ while (matcher.find()) {
+ output.collect(new Text(matcher.group(group)), new LongWritable(1));
+ }
+ }
+ }
+
+ // reduce: Count the number of occurrences of the pattern
+
+ class GrepReducer<K> extends MapReduceBase
+ implements Reducer<K, LongWritable, K, LongWritable> {
+
+ public void reduce(K key, Iterator<LongWritable> values,
+ OutputCollector<K, LongWritable> output,
+ Reporter reporter)
+ throws IOException {
+
+ // sum all values for this key
+ long sum = 0;
+ while (values.hasNext()) {
+ sum += values.next().get();
+ }
+
+ // output sum
+ output.collect(key, new LongWritable(sum));
+ }
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 3) {
+ System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
+ ToolRunner.printGenericCommandUsage(System.out);
+ return -1;
+ }
+
+ JobConf grepJob = new JobConf(getConf(), Grep.class);
+
+ grepJob.setJobName("grep");
+
+ grepJob.setInputPath(new Path(args[0]));
+ grepJob.setOutputPath(args[1]);
+
+ grepJob.setMapperClass(GrepMapper.class);
+ grepJob.setCombinerClass(GrepReducer.class);
+ grepJob.setReducerClass(GrepReducer.class);
+
+ grepJob.set("mapred.mapper.regex", args[2]);
+ if (args.length == 4)
+ grepJob.set("mapred.mapper.regex.group", args[3]);
+
+ grepJob.setOutputFormat(SequenceFileOutputFormat.class);
+ grepJob.setOutputKeyClass(Text.class);
+ grepJob.setOutputValueClass(LongWritable.class);
+
+ JobClient.runJob(grepJob);
+
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(new Configuration(), new Grep(), args);
+ System.exit(res);
+ }
+
+}
+
+
+
Notice how the data-flow of the above grep job is very similar to doing the
+same via the unix pipeline:
+
+
+cat input/* | grep | sort | uniq -c > out
+
+
+
+ input | map | shuffle | reduce > out
+
+
+
Hadoop Map-Reduce applications need not be written in
+JavaTM only.
+Hadoop Streaming is a utility
+which allows users to create and run jobs with any executables (e.g. shell
+utilities) as the mapper and/or the reducer.
+Hadoop Pipes is a
+SWIG-compatible C++ API to implement
+Map-Reduce applications (non JNITM based).
Operations included in this patch are partitioned into one of two types:
+join operations emitting tuples and "multi-filter" operations emitting a
+single value from (but not necessarily included in) a set of input values.
+For a given key, each operation will consider the cross product of all
+values for all sources at that node.
+
+
Identifiers supported by default:
+
+
+
identifier
type
description
+
inner
Join
Full inner join
+
outer
Join
Full outer join
+
override
MultiFilter
+
For a given key, prefer values from the rightmost source
+
+
+
A user of this class must set the InputFormat for the job to
+CompositeInputFormat and define a join expression accepted by the
+preceding grammar. For example, both of the following are acceptable:
CompositeInputFormat includes a handful of convenience methods to
+aid construction of these verbose statements.
+
+
As in the second example, joins may be nested. Users may provide a
+comparator class in the mapred.join.keycomparator property to specify
+the ordering of their keys, or accept the default comparator as returned by
+WritableComparator.get(keyclass).
+
+
Users can specify their own join operations, typically by overriding
+JoinRecordReader or MultiFilterRecordReader and mapping that
+class to an identifier in the join expression using the
+mapred.join.define.ident property, where ident is
+the identifier appearing in the join expression. Users may elect to emit- or
+modify- values passing through their join operation. Consulting the existing
+operations for guidance is recommended. Adding arguments is considerably more
+complex (and only partially supported), as one must also add a Node
+type to the parse tree. One is probably better off extending
+RecordReader in most cases.
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property, its default
+ value is 10 threads.
+
+
+Generally speaking, in order to implement an application using Map/Reduce
+model, the developer needs to implement Map and Reduce functions (and possibly
+Combine function). However, for a lot of applications related to counting and
+statistics computing, these functions have very similar
+characteristics. This provides a package implementing
+those patterns. In particular, the package provides a generic mapper class,
+a reducer class and a combiner class, and a set of built-in value aggregators.
+It also provides a generic utility class, ValueAggregatorJob, that offers a static function that
+creates map/reduce jobs:
+
+To call this function, the user needs to pass in arguments specifying the input directories, the output directory,
+the number of reducers, the input data format (textinputformat or sequencefileinputformat), and a file specifying user plugin class(es) to load by the mapper.
+A user plugin class is responsible for specifying what
+aggregators to use and what values are for which aggregators.
+A plugin class must implement the following interface:
+
+
+ public interface ValueAggregatorDescriptor {
+ public ArrayList<Entry> generateKeyValPairs(Object key, Object value);
+ public void configure(JobConf job);
+}
+
+
+Function generateKeyValPairs will generate aggregation key/value pairs for the
+input key/value pair. Each aggregation key encodes two pieces of information: the aggregation type and aggregation ID.
+The value is the value to be aggregated onto the aggregation ID according to the aggregation type. Here
+is a simple example user plugin class for counting the words in the input texts:
+
+
+public class WordCountAggregatorDescriptor extends ValueAggregatorBaseDescriptor {
+ public ArrayList<Entry> generateKeyValPairs(Object key, Object val) {
+ String words [] = val.toString().split(" |\t");
+ ArrayList<Entry> retv = new ArrayList<Entry>();
+ for (int i = 0; i < words.length; i++) {
+ retv.add(generateEntry(LONG_VALUE_SUM, words[i], ONE));
+ }
+ return retv;
+ }
+ public void configure(JobConf job) {}
+}
+
+
+In the above code, LONG_VALUE_SUM is a string denoting the aggregation type LongValueSum, which sums over long values.
+ONE denotes a string "1". Function generateEntry(LONG_VALUE_SUM, words[i], ONE) will interpret the first argument as an aggregation type, the second as an aggregation ID, and the third argument as the value to be aggregated. The output will look like: "LongValueSum:xxxx", where XXXX is the string value of words[i]. The value will be "1". The mapper will call generateKeyValPairs(Object key, Object val) for each input key/value pair to generate the desired aggregation id/value pairs.
+The down stream combiner/reducer will interpret these pairs as adding one to the aggregator XXXX.
+
+Class ValueAggregatorBaseDescriptor is a base class that user plugin classes can extend. Here is the XML fragment specifying the user plugin class:
+
+Thus, if no user plugin class is specified, the default behavior of the map/reduce job is to count the number of records (lines) in the input files.
+
+During runtime, the mapper will invoke the generateKeyValPairs function for each input key/value pair, and emit the generated
+key/value pairs:
+
+The reducer will create an aggregator object for each key/value list pair, and perform the appropriate aggregation.
+At the end, it will emit the aggregator's results:
+
+In order to be able to use a combiner, all the aggregators must be associative and commutative.
+The following are the types supported:
+
LongValueSum: sum over long values
+
DoubleValueSum: sum over float/double values
+
uniqValueCount: count the number of distinct values
+
ValueHistogram: compute the histogram of values, and compute the minimum, maximum, median, average, and standard deviation of numeric values
+
+
+
Create and run an application
+
+To create an application, the user needs to do the following things:
+
+1. Implement a user plugin:
+
+
+The application programs link against a thin C++ wrapper library that
+handles the communication with the rest of the Hadoop system. The C++
+interface is "swigable" so that interfaces can be generated for python
+and other scripting languages. All of the C++ functions and classes
+are in the HadoopPipes namespace. The job may consist of any
+combination of Java and C++ RecordReaders, Mappers, Partitioner,
+Combiner, Reducer, and RecordWriter.
+
+
+
+Hadoop Pipes has a generic Java class for handling the mapper and
+reducer (PipesMapRunner and PipesReducer). They fork off the
+application program and communicate with it over a socket. The
+communication is handled by the C++ wrapper library and the
+PipesMapRunner and PipesReducer.
+
+
+
+The application program passes in a factory object that can create
+the various objects needed by the framework to the runTask
+function. The framework creates the Mapper or Reducer as
+appropriate and calls the map or reduce method to invoke the
+application's code. The JobConf is available to the application.
+
+
+
+The Mapper and Reducer objects get all of their inputs, outputs, and
+context via context objects. The advantage of using the context
+objects is that their interface can be extended with additional
+methods without breaking clients. Although this interface is different
+from the current Java interface, the plan is to migrate the Java
+interface in this direction.
+
+
+
+Although the Java implementation is typed, the C++ interfaces of keys
+and values is just a byte buffer. Since STL strings provide precisely
+the right functionality and are standard, they will be used. The
+decision to not use stronger types was to simplify the interface.
+
+
+
+The application can also define combiner functions. The combiner will
+be run locally by the framework in the application process to avoid
+the round trip to the Java process and back. Because the compare
+function is not available in C++, the combiner will use memcmp to
+sort the inputs to the combiner. This is not as general as the Java
+equivalent, which uses the user's comparator, but should cover the
+majority of the use cases. As the map function outputs key/value
+pairs, they will be buffered. When the buffer is full, it will be
+sorted and passed to the combiner. The output of the combiner will be
+sent to the Java process.
+
+
+
+The application can also set a partition function to control which key
+is given to a particular reduce. If a partition function is not
+defined, the Java one will be used. The partition function will be
+called by the C++ framework before the key/value pair is sent back to
+Java.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+The API is abstract so that it can be implemented on top of
+a variety of metrics client libraries. The choice of
+client library is a configuration option, and different
+modules within the same application can use
+different metrics implementation libraries.
+
+Sub-packages:
+
+
org.apache.hadoop.metrics.spi
+
The abstract Server Provider Interface package. Those wishing to
+ integrate the metrics API with a particular metrics client library should
+ extend this package.
+
+
org.apache.hadoop.metrics.file
+
An implementation package which writes the metric data to
+ a file, or sends it to the standard output stream.
+
+
org.apache.hadoop.metrics.ganglia
+
An implementation package which sends metric data to
+ Ganglia.
+
+
+
Introduction to the Metrics API
+
+Here is a simple example of how to use this package to report a single
+metric value:
+
The context name will typically identify either the application, or else a
+ module within an application or library.
+
+
myRecord
+
The record name generally identifies some entity for which a set of
+ metrics are to be reported. For example, you could have a record named
+ "cacheStats" for reporting a number of statistics relating to the usage of
+ some cache in your application.
+
+
myMetric
+
This identifies a particular metric. For example, you might have metrics
+ named "cache_hits" and "cache_misses".
+
+
+
+
Tags
+
+In some cases it is useful to have multiple records with the same name. For
+example, suppose that you want to report statistics about each disk on a computer.
+In this case, the record name would be something like "diskStats", but you also
+need to identify the disk which is done by adding a tag to the record.
+The code could look something like this:
+
+
+Data is not sent immediately to the metrics system when
+MetricsRecord.update() is called. Instead it is stored in an
+internal table, and the contents of the table are sent periodically.
+This can be important for two reasons:
+
+
It means that a programmer is free to put calls to this API in an
+ inner loop, since updates can be very frequent without slowing down
+ the application significantly.
+
Some implementations can gain efficiency by combining many metrics
+ into a single UDP message.
+
+
+The API provides a timer-based callback via the
+registerUpdater() method. The benefit of this
+versus using java.util.Timer is that the callbacks will be done
+immediately before sending the data, making the data as current as possible.
+
+
Configuration
+
+It is possible to programmatically examine and modify configuration data
+before creating a context, like this:
+
+The factory attributes can be examined and modified using the following
+ContextFactory methods:
+
+
Object getAttribute(String attributeName)
+
String[] getAttributeNames()
+
void setAttribute(String name, Object value)
+
void removeAttribute(attributeName)
+
+
+
+ContextFactory.getFactory() initializes the factory attributes by
+reading the properties file hadoop-metrics.properties if it exists
+on the class path.
+
+
+A factory attribute named:
+
+contextName.class
+
+should have as its value the fully qualified name of the class to be
+instantiated by a call of the ContextFactory method
+getContext(contextName). If this factory attribute is not
+specified, the default is to instantiate
+org.apache.hadoop.metrics.file.FileContext.
+
+
+Other factory attributes are specific to a particular implementation of this
+API and are documented elsewhere. For example, configuration attributes for
+the file and Ganglia implementations can be found in the javadoc for
+their respective packages.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+These are the implementation specific factory attributes
+(See ContextFactory.getFactory()):
+
+
+
contextName.fileName
+
The path of the file to which metrics in context contextName
+ are to be appended. If this attribute is not specified, the metrics
+ are written to standard output by default.
+
+
contextName.period
+
The period in seconds on which the metric data is written to the
+ file.
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Implementation of the metrics package that sends metric data to
+Ganglia.
+Programmers should not normally need to use this package directly. Instead
+they should use org.hadoop.metrics.
+
+
+These are the implementation specific factory attributes
+(See ContextFactory.getFactory()):
+
+
+
contextName.servers
+
Space and/or comma separated sequence of servers to which UDP
+ messages should be sent.
+
+
contextName.period
+
The period in seconds on which the metric data is sent to the
+ server(s).
+
+
contextName.units.recordName.metricName
+
The units for the specified metric in the specified record.
+
+
contextName.slope.recordName.metricName
+
The slope for the specified metric in the specified record.
+
+
contextName.tmax.recordName.metricName
+
The tmax for the specified metric in the specified record.
+
+
contextName.dmax.recordName.metricName
+
The dmax for the specified metric in the specified record.
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+ org.apache.hadoop.metrics.file and
+org.apache.hadoop.metrics.ganglia.
+
+Plugging in an implementation involves writing a concrete subclass of
+AbstractMetricsContext. The subclass should get its
+ configuration information using the getAttribute(attributeName)
+ method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see #getOutputStream(Socket, long)
+
+ @param socket
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ node
+
+ @param node
+ a node
+ @return true if node is already in the tree; false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ scope
+ if scope starts with ~, choose one from the all nodes except for the
+ ones in scope; otherwise, choose one from scope
+ @param scope range of nodes from which a node will be chosen
+ @return the chosen node]]>
+
+
+
+
+
+
+ scope but not in excludedNodes
+ if scope starts with ~, return the number of nodes that are not
+ in scope and excludedNodes;
+ @param scope a path string that may start with ~
+ @param excludedNodes a list of nodes
+ @return number of available nodes]]>
+
+
+
+
+
+
+
+
+
+
+
+ reader
+ It linearly scans the array, if a local node is found, swap it with
+ the first element of the array.
+ If a local rack node is found, swap it with the first element following
+ the local node.
+ If neither local node or local rack node is found, put a random replica
+ location at position 0.
+ It leaves the rest nodes untouched.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new output stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ = getCount().
+ @param newCapacity The new capacity in bytes.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Index idx = startVector(...);
+ while (!idx.done()) {
+ .... // read element of a vector
+ idx.incr();
+ }
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Introduction
+
+ Software systems of any significant complexity require mechanisms for data
+interchange with the outside world. These interchanges typically involve the
+marshaling and unmarshaling of logical units of data to and from data streams
+(files, network connections, memory buffers etc.). Applications usually have
+some code for serializing and deserializing the data types that they manipulate
+embedded in them. The work of serialization has several features that make
+automatic code generation for it worthwhile. Given a particular output encoding
+(binary, XML, etc.), serialization of primitive types and simple compositions
+of primitives (structs, vectors etc.) is a very mechanical task. Manually
+written serialization code can be susceptible to bugs especially when records
+have a large number of fields or a record definition changes between software
+versions. Lastly, it can be very useful for applications written in different
+programming languages to be able to share and interchange data. This can be
+made a lot easier by describing the data records manipulated by these
+applications in a language agnostic manner and using the descriptions to derive
+implementations of serialization in multiple target languages.
+
+This document describes Hadoop Record I/O, a mechanism that is aimed
+at
+
+
enabling the specification of simple serializable data types (records)
+
enabling the generation of code in multiple target languages for
+marshaling and unmarshaling such types
+
providing target language specific support that will enable application
+programmers to incorporate generated code into their applications
+
+
+The goals of Hadoop Record I/O are similar to those of mechanisms such as XDR,
+ASN.1, PADS and ICE. While these systems all include a DDL that enables
+the specification of most record types, they differ widely in what else they
+focus on. The focus in Hadoop Record I/O is on data marshaling and
+multi-lingual support. We take a translator-based approach to serialization.
+Hadoop users have to describe their data in a simple data description
+language. The Hadoop DDL translator rcc generates code that users
+can invoke in order to read/write their data from/to simple stream
+abstractions. Next we list explicitly some of the goals and non-goals of
+Hadoop Record I/O.
+
+
+
Goals
+
+
+
Support for commonly used primitive types. Hadoop should include as
+primitives commonly used builtin types from programming languages we intend to
+support.
+
+
Support for common data compositions (including recursive compositions).
+Hadoop should support widely used composite types such as structs and
+vectors.
+
+
Code generation in multiple target languages. Hadoop should be capable of
+generating serialization code in multiple target languages and should be
+easily extensible to new target languages. The initial target languages are
+C++ and Java.
+
+
Support for generated target languages. Hadoop should include support
+in the form of headers, libraries, packages for supported target languages
+that enable easy inclusion and use of generated code in applications.
+
+
Support for multiple output encodings. Candidates include
+packed binary, comma-separated text, XML etc.
+
+
Support for specifying record types in a backwards/forwards compatible
+manner. This will probably be in the form of support for optional fields in
+records. This version of the document does not include a description of the
+planned mechanism, we intend to include it in the next iteration.
+
+
+
+
Non-Goals
+
+
+
Serializing existing arbitrary C++ classes.
+
Serializing complex data structures such as trees, linked lists etc.
+
Built-in indexing schemes, compression, or check-sums.
+
Dynamic construction of objects from an XML schema.
+
+
+The remainder of this document describes the features of Hadoop record I/O
+in more detail. Section 2 describes the data types supported by the system.
+Section 3 lays out the DDL syntax with some examples of simple records.
+Section 4 describes the process of code generation with rcc. Section 5
+describes target language mappings and support for Hadoop types. We include a
+fairly complete description of C++ mappings with intent to include Java and
+others in upcoming iterations of this document. The last section talks about
+supported output encodings.
+
+
+
Data Types and Streams
+
+This section describes the primitive and composite types supported by Hadoop.
+We aim to support a set of types that can be used to simply and efficiently
+express a wide range of record types in different programming languages.
+
+
Primitive Types
+
+For the most part, the primitive types of Hadoop map directly to primitive
+types in high level programming languages. Special cases are the
+ustring (a Unicode string) and buffer types, which we believe
+find wide use and which are usually implemented in library code and not
+available as language built-ins. Hadoop also supplies these via library code
+when a target language built-in is not present and there is no widely
+adopted "standard" implementation. The complete list of primitive types is:
+
+
+
byte: An 8-bit unsigned integer.
+
boolean: A boolean value.
+
int: A 32-bit signed integer.
+
long: A 64-bit signed integer.
+
float: A single precision floating point number as described by
+ IEEE-754.
+
double: A double precision floating point number as described by
+ IEEE-754.
+
ustring: A string consisting of Unicode characters.
+
buffer: An arbitrary sequence of bytes.
+
+
+
+
Composite Types
+Hadoop supports a small set of composite types that enable the description
+of simple aggregate types and containers. A composite type is serialized
+by sequentially serializing its constituent elements. The supported
+composite types are:
+
+
+
+
record: An aggregate type like a C-struct. This is a list of
+typed fields that are together considered a single unit of data. A record
+is serialized by sequentially serializing its constituent fields. In addition
+to serialization a record has comparison operations (equality and less-than)
+implemented for it, these are defined as memberwise comparisons.
+
+
vector: A sequence of entries of the same data type, primitive
+or composite.
+
+
map: An associative container mapping instances of a key type to
+instances of a value type. The key and value types may themselves be primitive
+or composite types.
+
+
+
+
Streams
+
+Hadoop generates code for serializing and deserializing record types to
+abstract streams. For each target language Hadoop defines very simple input
+and output stream interfaces. Application writers can usually develop
+concrete implementations of these by putting a one method wrapper around
+an existing stream implementation.
+
+
+
DDL Syntax and Examples
+
+We now describe the syntax of the Hadoop data description language. This is
+followed by a few examples of DDL usage.
+
+
+
+A DDL file describes one or more record types. It begins with zero or
+more include declarations, a single mandatory module declaration
+followed by zero or more class declarations. The semantics of each of
+these declarations are described below:
+
+
+
+
include: An include declaration specifies a DDL file to be
+referenced when generating code for types in the current DDL file. Record types
+in the current compilation unit may refer to types in all included files.
+File inclusion is recursive. An include does not trigger code
+generation for the referenced file.
+
+
module: Every Hadoop DDL file must have a single module
+declaration that follows the list of includes and precedes all record
+declarations. A module declaration identifies a scope within which
+the names of all types in the current file are visible. Module names are
+mapped to C++ namespaces, Java packages etc. in generated code.
+
+
class: Records types are specified through class
+declarations. A class declaration is like a Java class declaration.
+It specifies a named record type and a list of fields that constitute records
+of the type. Usage is illustrated in the following examples.
+
+
+
+
Examples
+
+
+
A simple DDL file links.jr with just one record declaration.
+
+module links {
+ class Link {
+ ustring URL;
+ boolean isRelative;
+ ustring anchorText;
+ };
+}
+
+
+The Hadoop translator is written in Java. Invocation is done by executing a
+wrapper shell script named rcc. It takes a list of
+record description files as a mandatory argument and an
+optional language argument (the default is Java) --language or
+-l. Thus a typical invocation would look like:
+
+$ rcc -l C++ ...
+
+
+
+
Target Language Mappings and Support
+
+For all target languages, the unit of code generation is a record type.
+For each record type, Hadoop generates code for serialization and
+deserialization, record comparison and access to record members.
+
+
C++
+
+Support for including Hadoop generated C++ code in applications comes in the
+form of a header file recordio.hh which needs to be included in source
+that uses Hadoop types and a library librecordio.a which applications need
+to be linked with. The header declares the Hadoop C++ namespace which defines
+appropriate types for the various primitives, the basic interfaces for
+records and streams and enumerates the supported serialization encodings.
+Declarations of these interfaces and a description of their semantics follow:
+
+
RecFormat: An enumeration of the serialization encodings supported
+by this implementation of Hadoop.
+
+
InStream: A simple abstraction for an input stream. This has a
+single public read method that reads n bytes from the stream into
+the buffer buf. Has the same semantics as a blocking read system
+call. Returns the number of bytes read or -1 if an error occurs.
+
+
OutStream: A simple abstraction for an output stream. This has a
+single write method that writes n bytes to the stream from the
+buffer buf. Has the same semantics as a blocking write system
+call. Returns the number of bytes written or -1 if an error occurs.
+
+
RecordReader: A RecordReader reads records one at a time from
+an underlying stream in a specified record format. The reader is instantiated
+with a stream and a serialization format. It has a read method that
+takes an instance of a record and deserializes the record from the stream.
+
+
RecordWriter: A RecordWriter writes records one at a
+time to an underlying stream in a specified record format. The writer is
+instantiated with a stream and a serialization format. It has a
+write method that takes an instance of a record and serializes the
+record to the stream.
+
+
Record: The base class for all generated record types. This has two
+public methods type and signature that return the typename and the
+type signature of the record.
+
+
+
+Two files are generated for each record file (note: not for each record). If a
+record file is named "name.jr", the generated files are
+"name.jr.cc" and "name.jr.hh" containing serialization
+implementations and record type declarations respectively.
+
+For each record in the DDL file, the generated header file will contain a
+class definition corresponding to the record type, method definitions for the
+generated type will be present in the '.cc' file. The generated class will
+inherit from the abstract class hadoop::Record. The DDL files
+module declaration determines the namespace the record belongs to.
+Each '.' delimited token in the module declaration results in the
+creation of a namespace. For instance, the declaration module docs.links
+results in the creation of a docs namespace and a nested
+docs::links namespace. In the preceding examples, the Link class
+is placed in the links namespace. The header file corresponding to
+the links.jr file will contain:
+
+
+namespace links {
+ class Link : public hadoop::Record {
+ // ....
+ };
+};
+
+
+Each field within the record will cause the generation of a private member
+declaration of the appropriate type in the class declaration, and one or more
+accessor methods. The generated class will implement the serialize and
+deserialize methods defined in hadoop::Record+. It will also
+implement the inspection methods type and signature from
+hadoop::Record. A default constructor and virtual destructor will also
+be generated. Serialization code will read/write records into streams that
+implement the hadoop::InStream and the hadoop::OutStream interfaces.
+
+For each member of a record an accessor method is generated that returns
+either the member or a reference to the member. For members that are returned
+by value, a setter method is also generated. This is true for primitive
+data members of the types byte, int, long, boolean, float and
+double. For example, for an int field called MyField the following
+code is generated.
+
+
+
+For a ustring or buffer or composite field. The generated code
+only contains accessors that return a reference to the field. A const
+and a non-const accessor are generated. For example:
+
+
+
+Code generation for Java is similar to that for C++. A Java class is generated
+for each record type with private members corresponding to the fields. Getters
+and setters for fields are also generated. Some differences arise in the
+way comparison is expressed and in the mapping of modules to packages and
+classes to files. For equality testing, an equals method is generated
+for each record type. As per Java requirements a hashCode method is also
+generated. For comparison a compareTo method is generated for each
+record type. This has the semantics as defined by the Java Comparable
+interface, that is, the method returns a negative integer, zero, or a positive
+integer as the invoked object is less than, equal to, or greater than the
+comparison parameter.
+
+A .java file is generated per record type as opposed to per DDL
+file as in C++. The module declaration translates to a Java
+package declaration. The module name maps to an identical Java package
+name. In addition to this mapping, the DDL compiler creates the appropriate
+directory hierarchy for the package and places the generated .java
+files in the correct directories.
+
+
Mapping Summary
+
+
+DDL Type C++ Type Java Type
+
+boolean bool boolean
+byte int8_t byte
+int int32_t int
+long int64_t long
+float float float
+double double double
+ustring std::string java.lang.String
+buffer std::string org.apache.hadoop.record.Buffer
+class type class type class type
+vector std::vector java.util.ArrayList
+map std::map java.util.TreeMap
+
+
+
Data encodings
+
+This section describes the format of the data encodings supported by Hadoop.
+Currently, three data encodings are supported, namely binary, CSV and XML.
+
+
Binary Serialization Format
+
+The binary data encoding format is fairly dense. Serialization of composite
+types is simply defined as a concatenation of serializations of the constituent
+elements (lengths are included in vectors and maps).
+
+Composite types are serialized as follows:
+
+
class: Sequence of serialized members.
+
vector: The number of elements serialized as an int. Followed by a
+sequence of serialized elements.
+
map: The number of key value pairs serialized as an int. Followed
+by a sequence of serialized (key,value) pairs.
+
+
+Serialization of primitives is more interesting, with a zero compression
+optimization for integral types and normalization to UTF-8 for strings.
+Primitive types are serialized as follows:
+
+
+
byte: Represented by 1 byte, as is.
+
boolean: Represented by 1-byte (0 or 1)
+
int/long: Integers and longs are serialized zero compressed.
+Represented as 1-byte if -120 <= value < 128. Otherwise, serialized as a
+sequence of 2-5 bytes for ints, 2-9 bytes for longs. The first byte represents
+the number of trailing bytes, N, as the negative number (-120-N). For example,
+the number 1024 (0x400) is represented by the byte sequence 'x86 x04 x00'.
+This doesn't help much for 4-byte integers but does a reasonably good job with
+longs without bit twiddling.
+
float/double: Serialized in IEEE 754 single and double precision
+format in network byte order. This is the format used by Java.
+
ustring: Serialized as 4-byte zero compressed length followed by
+data encoded as UTF-8. Strings are normalized to UTF-8 regardless of native
+language representation.
+
buffer: Serialized as a 4-byte zero compressed length followed by the
+raw bytes in the buffer.
+
+
+
+
CSV Serialization Format
+
+The CSV serialization format has a lot more structure than the "standard"
+Excel CSV format, but we believe the additional structure is useful because
+
+
+
it makes parsing a lot easier without detracting too much from legibility
+
the delimiters around composites make it obvious when one is reading a
+sequence of Hadoop records
+
+
+Serialization formats for the various types are detailed in the grammar that
+follows. The notable feature of the formats is the use of delimiters for
+indicating the certain field types.
+
+
+
A string field begins with a single quote (').
+
A buffer field begins with a sharp (#).
+
A class, vector or map begins with 's{', 'v{' or 'm{' respectively and
+ends with '}'.
+
+
+The CSV format can be described by the following grammar:
+
+
+
+The XML serialization format is the same used by Apache XML-RPC
+(http://ws.apache.org/xmlrpc/types.html). This is an extension of the original
+XML-RPC format and adds some additional data types. All record I/O types are
+not directly expressible in this format, and access to a DDL is required in
+order to convert these to valid types. All types primitive or composite are
+represented by <value> elements. The particular XML-RPC type is
+indicated by a nested element in the <value> element. The encoding for
+records is always UTF-8. Primitive types are serialized as follows:
+
+
+
byte: XML tag <ex:i1>. Values: 1-byte unsigned
+integers represented in US-ASCII
+
boolean: XML tag <boolean>. Values: "0" or "1"
+
int: XML tags <i4> or <int>. Values: 4-byte
+signed integers represented in US-ASCII.
+
long: XML tag <ex:i8>. Values: 8-byte signed integers
+represented in US-ASCII.
+
float: XML tag <ex:float>. Values: Single precision
+floating point numbers represented in US-ASCII.
+
double: XML tag <double>. Values: Double precision
+floating point numbers represented in US-ASCII.
+
ustring: XML tag <string>. Values: String values
+represented as UTF-8. XML does not permit all Unicode characters in literal
+data. In particular, NULLs and control chars are not allowed. Additionally,
+XML processors are required to replace carriage returns with line feeds and to
+replace CRLF sequences with line feeds. Programming languages that we work
+with do not impose these restrictions on string types. To work around these
+restrictions, disallowed characters and CRs are percent escaped in strings.
+The '%' character is also percent escaped.
+
buffer: XML tag <string>. Values: Arbitrary binary
+data. Represented as hexBinary, each byte is replaced by its 2-byte
+hexadecimal representation.
+
+
+Composite types are serialized as follows:
+
+
+
class: XML tag <struct>. A struct is a sequence of
+<member> elements. Each <member> element has a <name>
+element and a <value> element. The <name> is a string that must
+match /[a-zA-Z][a-zA-Z0-9_]*/. The value of the member is represented
+by a <value> element.
+
+
vector: XML tag <array>. An <array> contains a
+single <data> element. The <data> element is a sequence of
+<value> elements each of which represents an element of the vector.
+
+
map: XML tag <array>. Same as vector.
+
+
+
+For example:
+
+
+class {
+ int MY_INT; // value 5
+ vector MY_VEC; // values 0.1, -0.89, 2.45e4
+ buffer MY_BUF; // value '\00\n\tabc%'
+}
+
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take an insignificant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform else
+ (e.g. in Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remains unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new Sort(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/jdiff/hadoop_0.18.1.xml b/lib/jdiff/hadoop_0.18.1.xml
new file mode 100644
index 00000000000..fd844cbed0f
--- /dev/null
+++ b/lib/jdiff/hadoop_0.18.1.xml
@@ -0,0 +1,44778 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The balancer is a tool that balances disk space usage on an HDFS cluster
+ when some datanodes become full or when new empty nodes join the cluster.
+ The tool is deployed as an application program that can be run by the
+ cluster administrator on a live HDFS cluster while applications
+ are adding and deleting files.
+
+
SYNOPSIS
+
+ To start:
+ bin/start-balancer.sh [-threshold ]
+ Example: bin/start-balancer.sh
+ start the balancer with a default threshold of 10%
+ bin/start-balancer.sh -threshold 5
+ start the balancer with a threshold of 5%
+ To stop:
+ bin/stop-balancer.sh
+
+
+
DESCRIPTION
+
The threshold parameter is a fraction in the range of (0%, 100%) with a
+ default value of 10%. The threshold sets a target for whether the cluster
+ is balanced. A cluster is balanced if for each datanode, the utilization
+ of the node (ratio of used space at the node to total capacity of the node)
+ differs from the utilization of the cluster (ratio of used space in the cluster
+ to total capacity of the cluster) by no more than the threshold value.
+ The smaller the threshold, the more balanced a cluster will become.
+ It takes more time to run the balancer for small threshold values.
+ Also for a very small threshold the cluster may not be able to reach the
+ balanced state when applications write and delete files concurrently.
+
+
The tool moves blocks from highly utilized datanodes to poorly
+ utilized datanodes iteratively. In each iteration a datanode moves or
+ receives no more than the lesser of 10G bytes or the threshold fraction
+ of its capacity. Each iteration runs no more than 20 minutes.
+ At the end of each iteration, the balancer obtains updated datanodes
+ information from the namenode.
+
+
A system property that limits the balancer's use of bandwidth is
+ defined in the default configuration file:
+
+
+ dfs.balance.bandwidthPerSec
+ 1048576
+ Specifies the maximum bandwidth that each datanode
+ can utilize for the balancing purpose in terms of the number of bytes
+ per second.
+
+
+
+
This property determines the maximum speed at which a block will be
+ moved from one datanode to another. The default value is 1MB/s. The higher
+ the bandwidth, the faster a cluster can reach the balanced state,
+ but with greater competition with application processes. If an
+ administrator changes the value of this property in the configuration
+ file, the change is observed when HDFS is next restarted.
+
+
+ MONITORING BALANCER PROGRESS
+
After the balancer is started, an output file name where the balancer
+ progress will be recorded is printed on the screen. The administrator
+ can monitor the running of the balancer by reading the output file.
+ The output shows the balancer's status iteration by iteration. In each
+ iteration it prints the starting time, the iteration number, the total
+ number of bytes that have been moved in the previous iterations,
+ the total number of bytes that are left to move in order for the cluster
+ to be balanced, and the number of bytes that are being moved in this
+ iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left
+ To Move" is decreasing.
+
+
Running multiple instances of the balancer in an HDFS cluster is
+ prohibited by the tool.
+
+
The balancer automatically exits when any of the following five
+ conditions is satisfied:
+
+
The cluster is balanced;
+
No block can be moved;
+
No block has been moved for five consecutive iterations;
+
An IOException occurs while communicating with the namenode;
+
Another balancer is running.
+
+
+
+ Upon exit, a balancer returns an exit code and prints one of the
+ following messages to the output file corresponding to the above exit
+ reasons:
+
+
The cluster is balanced. Exiting
+
No block can be moved. Exiting...
+
No block has been moved for 3 iterations. Exiting...
+
Received an IO exception: failure reason. Exiting...
+
Another balancer is running. Exiting...
+
+
+
The administrator can interrupt the execution of the balancer at any
+ time by running the command "stop-balancer.sh" on the machine where the
+ balancer is running.]]>
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link NamenodeFsck#FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link NamenodeFsck#FIXING_MOVE}). Remaining data blocks are saved as a
+ block chains, representing longest consecutive series of valid blocks.
{@link FSConstants.StartupOption#REGULAR REGULAR} - normal startup
+
{@link FSConstants.StartupOption#FORMAT FORMAT} - format name node
+
{@link FSConstants.StartupOption#UPGRADE UPGRADE} - start the cluster
+ upgrade and create a snapshot of the current file system state
+
{@link FSConstants.StartupOption#ROLLBACK ROLLBACK} - roll the
+ cluster back to the previous state
+
+ The option is passed via configuration field:
+ dfs.namenode.startup
+
+ The conf will be modified to reflect the actual ports on which
+ the NameNode is up and running if the user passes the port as
+ zero in the conf.
+
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+ zero.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ datanode whose
+ total size is size
+
+ @param datanode on which blocks are located
+ @param size total size of blocks]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ blocksequence (namespace)
+ 2) block->machinelist ("inodes")
+
+ The first table is stored on disk and is very precious.
+ The second table is rebuilt every time the NameNode comes
+ up.
+
+ 'NameNode' refers to both this class as well as the 'NameNode server'.
+ The 'FSNamesystem' class actually performs most of the filesystem
+ management. The majority of the 'NameNode' class itself is concerned
+ with exposing the IPC interface to the outside world, plus some
+ configuration management.
+
+ NameNode implements the ClientProtocol interface, which allows
+ clients to ask for DFS services. ClientProtocol is not
+ designed for direct use by authors of DFS client code. End-users
+ should instead use the org.apache.nutch.hadoop.fs.FileSystem class.
+
+ NameNode also implements the DatanodeProtocol interface, used by
+ DataNode programs that actually store DFS data blocks. These
+ methods are invoked repeatedly and automatically by all the
+ DataNodes in a DFS deployment.
+
+ NameNode also implements the NamenodeProtocol interface, used by
+ secondary namenodes or rebalancing processes to get partial namenode's
+ state, for example partial blocksMap etc.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The tool scans all files and directories, starting from an indicated
+ root path. The following abnormal conditions are detected and handled:
+
+
files with blocks that are completely missing from all datanodes.
+ In this case the tool can perform one of the following actions:
+
+
none ({@link #FIXING_NONE})
+
move corrupted files to /lost+found directory on DFS
+ ({@link #FIXING_MOVE}). Remaining data blocks are saved as a
+ block chains, representing longest consecutive series of valid blocks.
+
delete corrupted files ({@link #FIXING_DELETE})
+
+
+
detect files with under-replicated or over-replicated blocks
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids unnecessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types is not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more efficient,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines
+ the classes which will be wrapped in GenericObject in application.
+ Attention: this classes defined in getTypes() method, must
+ implement Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+    method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting as position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+   @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+   serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+   @param out DataOutput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+   @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.value;
+ int thatValue = ((IntWritable)o).value;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input streamin
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when skipped less number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+   @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+   using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-meterics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Grouphandles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This method is used to validate the input directories when a job is
+   submitted so that the {@link JobClient} can fail early, with a useful
+ error message, in case of errors. For e.g. input directory does not exist.
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input
+ @deprecated getSplits is called in the client and can perform any
+ necessary validation of the input]]>
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size is insufficient for many
+   applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+   responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ it's status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is same as the
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes awhile, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transfering map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing it out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ rest of the framework and/or job-configuration and is relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog.
+ and etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into a
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it is
+ a job is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on a all or a subset of the key.
+
+ @param key the key to be paritioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In the phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}.Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .
+ @param name The name of the server
+ @param port The port to use on the server
+ @param findPort whether the server should start at the given port and
+ increment by 1 until it finds a free port.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ points to the log directory
+ "/static/" -> points to common static files (src/webapps/static)
+ "/" -> the jsp server code from (src/webapps/)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use :
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of thread the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property, its default
+ value is 10 threads.
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ pairs. Uses
+ {@link StringTokenizer} to break text into tokens.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ generateKeyValPairs(Object key, Object value); public void
+ configure(JobConfjob); }
+
+ The package also provides a base class, ValueAggregatorBaseDescriptor,
+ implementing the above interface. The user can extend the base class and
+ implement generateKeyValPairs accordingly.
+
+ The primary work of generateKeyValPairs is to emit one or more key/value
+ pairs based on the input key/value pair. The key in an output key/value pair
+ encode two pieces of information: aggregation type and aggregation id. The
+ value will be aggregated onto the aggregation id according the aggregation
+ type.
+
+ This class offers a function to generate a map/reduce job using Aggregate
+ framework. The function takes the following parameters: input directory spec
+ input format (text or sequence file) output directory a file specifying the
+ user plugin class]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take an insignificant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occurred and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform else
+ (e.g. in Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remains unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new Sort(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/jdiff/hadoop_0.18.2.xml b/lib/jdiff/hadoop_0.18.2.xml
new file mode 100644
index 00000000000..08173ab82dc
--- /dev/null
+++ b/lib/jdiff/hadoop_0.18.2.xml
@@ -0,0 +1,38788 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destiniation buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids unnecessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines
+ the classes which will be wrapped in GenericObject in application.
+ Attention: this classes defined in getTypes() method, must
+ implement Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting as position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.value;
+ int thatValue = ((IntWritable)o).value;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when skipped less number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Group handles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This method is used to validate the input directories when a job is
+ submitted so that the {@link JobClient} can fail early, with an useful
+ error message, in case of errors. For e.g. input directory does not exist.
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input
+ @deprecated getSplits is called in the client and can perform any
+ necessary validation of the input]]>
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size is insufficient for many
+ applications since record boundaries are to respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibilty to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ it's status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is same as the
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes awhile, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transfering map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing it out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ rest of the framework and/or job-configuration and is relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog.
+ and etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into a
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it is
+ a job is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on a all or a subset of the key.
+
+ @param key the key to be paritioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In the phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}.Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .
+ @param name The name of the server
+ @param port The port to use on the server
+ @param findPort whether the server should start at the given port and
+ increment by 1 until it finds a free port.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ points to the log directory
+ "/static/" -> points to common static files (src/webapps/static)
+ "/" -> the jsp server code from (src/webapps/)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use :
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of thread the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property, its default
+ value is 10 threads.
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ pairs. Uses
+ {@link StringTokenizer} to break text into tokens.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ generateKeyValPairs(Object key, Object value); public void
+ configure(JobConfjob); }
+
+ The package also provides a base class, ValueAggregatorBaseDescriptor,
+ implementing the above interface. The user can extend the base class and
+ implement generateKeyValPairs accordingly.
+
+ The primary work of generateKeyValPairs is to emit one or more key/value
+ pairs based on the input key/value pair. The key in an output key/value pair
+ encode two pieces of information: aggregation type and aggregation id. The
+ value will be aggregated onto the aggregation id according the aggregation
+ type.
+
+ This class offers a function to generate a map/reduce job using Aggregate
+ framework. The function takes the following parameters: input directory spec
+ input format (text or sequence file) output directory a file specifying the
+ user plugin class]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the later
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the later
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standarad command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take an insignificant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occured and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform else
+ (e.g. in Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutorshould be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remains unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new Sort(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/jdiff/hadoop_0.18.3.xml b/lib/jdiff/hadoop_0.18.3.xml
new file mode 100644
index 00000000000..564916fef77
--- /dev/null
+++ b/lib/jdiff/hadoop_0.18.3.xml
@@ -0,0 +1,38826 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Hadoop by default specifies two resources, loaded in-order from the
+ classpath:
hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link JobConf}. The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link Mapper} or {@link Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a path pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is {@link DistributedFileSystem}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf destination buffer
+ @param offset offset in buf at which to store data
+ @param len maximum number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the position to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destination buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids unnecessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types are not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines
+ the classes which will be wrapped in GenericObject in application.
+ Attention: this classes defined in getTypes() method, must
+ implement Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to chose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instatiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting as position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserialing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deseriablize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input streamin
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when skipped less number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Grouphandles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This method is used to validate the input directories when a job is
+ submitted so that the {@link JobClient} can fail early, with an useful
+ error message, in case of errors. For e.g. input directory does not exist.
+
+
+ @param job job configuration.
+ @throws InvalidInputException if the job does not have valid input
+ @deprecated getSplits is called in the client and can perform any
+ necessary validation of the input]]>
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size is insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ it's status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes awhile, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing it out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ with the rest of the framework and/or job-configuration and is relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog.
+ and etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the output records.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it
+ is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
 Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .
+ @param name The name of the server
+ @param port The port to use on the server
+ @param findPort whether the server should start at the given port and
+ increment by 1 until it finds a free port.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ points to the log directory
+ "/static/" -> points to common static files (src/webapps/static)
+ "/" -> the jsp server code from (src/webapps/)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use :
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property, its default
+ value is 10 threads.
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ pairs. Uses
+ {@link StringTokenizer} to break text into tokens.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ generateKeyValPairs(Object key, Object value); public void
+ configure(JobConf job); }
+
+ The package also provides a base class, ValueAggregatorBaseDescriptor,
+ implementing the above interface. The user can extend the base class and
+ implement generateKeyValPairs accordingly.
+
+ The primary work of generateKeyValPairs is to emit one or more key/value
+ pairs based on the input key/value pair. The key in an output key/value pair
+ encode two pieces of information: aggregation type and aggregation id. The
+ value will be aggregated onto the aggregation id according the aggregation
+ type.
+
+ This class offers a function to generate a map/reduce job using Aggregate
+ framework. The function takes the following parameters: input directory spec
+ input format (text or sequence file) output directory a file specifying the
+ user plugin class]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take an insignificant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occured and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform else
+ (e.g. in Cygwin/Windows) it returns null.
+ @param job job configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutorshould be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remains unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new Sort(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/jdiff/hadoop_0.19.0.xml b/lib/jdiff/hadoop_0.19.0.xml
new file mode 100644
index 00000000000..557ac3cc598
--- /dev/null
+++ b/lib/jdiff/hadoop_0.19.0.xml
@@ -0,0 +1,43972 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param in InputStream to deserialize the object from.]]>
+
+
+
+
+
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property
+ as an array of Class.
+ The value of the property specifies a list of comma separated class names.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the property name.
+ @param defaultValue default value.
+ @return property value as a Class[],
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Unless explicitly turned off, Hadoop by default specifies two
+ resources, loaded in-order from the classpath:
hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link org.apache.hadoop.mapred.JobConf}.
+ The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+ or {@link org.apache.hadoop.mapred.Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a pth pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is DistributedFileSystem.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf desitination buffer
+ @param offset offset in buf at which to store data
+ @param len maximun number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the postion to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destiniation buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids uneccessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types is not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines
+ the classes which will be wrapped in GenericObject in application.
+ Attention: this classes defined in getTypes() method, must
+ implement Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to chose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting as position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.counter;
+ int thatValue = w.counter;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when skipped less number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair compatible with lzop.
+ http://www.lzop.org/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FIXME: This array should be in a private or package private location,
+ since it could be modified by malicious code.
+ ]]>
+
+
+
+
+ This interface is public for historical purposes. You should have no need to
+ use it.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+ Although BZip2 headers are marked with the magic "Bz" this
+ constructor expects the next byte in the stream to be the first one after
+ the magic. Thus callers have to skip the first two bytes. Otherwise this
+ constructor will throw an exception.
+
+
+ @throws IOException
+ if the stream content is malformed or an I/O error occurs.
+ @throws NullPointerException
+ if in == null]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The decompression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2InputStream to release the allocated memory. See
+ {@link CBZip2OutputStream CBZip2OutputStream} for information about memory
+ usage.
+
+
+
+ CBZip2InputStream reads bytes from the compressed source stream via
+ the single byte {@link java.io.InputStream#read() read()} method exclusively.
+ Thus you should consider to use a buffered source stream.
+
+
+
+ Instances of this class are not threadsafe.
+
]]>
+
+
+
+
+
+
+
+
+
+ CBZip2OutputStream with a blocksize of 900k.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+ @param out *
+ the destination stream.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws NullPointerException
+ if out == null.]]>
+
+
+
+
+
+ CBZip2OutputStream with specified blocksize.
+
+
+ Attention: The caller is responsible for writing the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+
+ @param out
+ the destination stream.
+ @param blockSize
+ the blockSize as 100k units.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws IllegalArgumentException
+ if (blockSize < 1) || (blockSize > 9).
+ @throws NullPointerException
+ if out == null.
+
+ @see #MIN_BLOCKSIZE
+ @see #MAX_BLOCKSIZE]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ inputLength this method returns MAX_BLOCKSIZE
+ always.
+
+ @param inputLength
+ The length of the data which will be compressed by
+ CBZip2OutputStream.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ == 1.]]>
+
+
+
+
+ == 9.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If you are ever unlucky/improbable enough to get a stack overflow whilst
+ sorting, increase the following constant and try again. In practice I
+ have never seen the stack go above 27 elems, so the following limit seems
+ very generous.
+ ]]>
+
+
+
+
+ The compression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2OutputStream to release the allocated memory.
+
+
+
+ You can shrink the amount of allocated memory and maybe raise the compression
+ speed by choosing a lower blocksize, which in turn may cause a lower
+ compression ratio. You can avoid unnecessary memory allocation by avoiding
+ using a blocksize which is bigger than the size of the input.
+
+
+
+ You can compute the memory usage for compressing by the following formula:
+
+
+
+ <code>400k + (9 * blocksize)</code>.
+
+
+
+ To get the memory required for decompression by {@link CBZip2InputStream
+ CBZip2InputStream} use
+
+
+
+ <code>65k + (5 * blocksize)</code>.
+
+
+
+
+
+
+
Memory usage by blocksize
+
+
+
Blocksize
Compression
+ memory usage
Decompression
+ memory usage
+
+
+
100k
+
1300k
+
565k
+
+
+
200k
+
2200k
+
1065k
+
+
+
300k
+
3100k
+
1565k
+
+
+
400k
+
4000k
+
2065k
+
+
+
500k
+
4900k
+
2565k
+
+
+
600k
+
5800k
+
3065k
+
+
+
700k
+
6700k
+
3565k
+
+
+
800k
+
7600k
+
4065k
+
+
+
900k
+
8500k
+
4565k
+
+
+
+
+ For decompression CBZip2InputStream allocates less memory if the
+ bzipped input is smaller than one block.
+
+
+
+ Instances of this class are not threadsafe.
+
+
+
+ TODO: Update to BZip2 1.0.1
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Grouphandles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Note: The following is valid only if the {@link OutputCommitter}
+ is {@link FileOutputCommitter}. If OutputCommitter is not
+ a FileOutputCommitter, the task's temporary output
+ directory is same as {@link #getOutputPath(JobConf)} i.e.
+ ${mapred.output.dir}$
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The generated name can be used to create custom files from within the
+ different tasks for the job, the names for different tasks will not collide
+ with each other.
+
+
The given name is postfixed with the task type, 'm' for maps, 'r' for
+ reduces and the task partition number. For example, give a name 'test'
+ running on the first map of the job the generated name will be
+ 'test-m-00000'.
+
+ @param conf the configuration for the job.
+ @param name the name to make unique.
+ @return a unique name across all tasks of the job.]]>
+
+
+
+
+
+
+ The path can be used to create custom files from within the map and
+ reduce tasks. The path name will be unique for each task. The path parent
+ will be the job output directory.ls
+
+
This method uses the {@link #getUniqueName} method to make the file name
+ unique for the task.
+
+ @param conf the configuration for the job.
+ @param name the name for the file.
+ @return a unique path across all tasks of the job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size is insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibilty to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
+
+
+ @see JobConf
+ @see ClusterStatus
+ @see Tool
+ @see DistributedCache]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If the parameter {@code loadDefaults} is false, the new instance
+ will not load resources from the default files.
+
+ @param loadDefaults specifies whether to load from the default files]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if framework should keep the intermediate files
+ for failed tasks, false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the outputs of the maps are to be compressed,
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This comparator should be provided if the equivalence rules for keys
+ for sorting the intermediates are different from those for grouping keys
+ before each call to
+ {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
+
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is same as the
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing it out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the reduce failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ with the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the output records.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it is
+ submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In the phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's cleanup-tasks, as a float between 0.0
+ and 1.0. When all cleanup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's cleanup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's setup-tasks, as a float between 0.0
+ and 1.0. When all setup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's setup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output value class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop provides an optional mode of execution in which the bad records
+ are detected and skipped in further attempts.
+
+
This feature can be used when map/reduce tasks crash deterministically on
+ certain input. This happens due to bugs in the map/reduce function. The usual
+ course would be to fix these bugs. But sometimes this is not possible;
+ perhaps the bug is in third party libraries for which the source code is
+ not available. Due to this, the task never reaches completion even with
+ multiple attempts and complete data for that task is lost.
+
+
With this feature, only a small portion of data is lost surrounding
+ the bad record, which may be acceptable for some user applications.
+ see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}
+
+
The skipping mode gets kicked off after a certain number of failures,
+ see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}
+
+
In the skipping mode, the map/reduce task maintains the record range which
+ is getting processed at all times. Before giving the input to the
+ map/reduce function, it sends this record range to the Task tracker.
+ If task crashes, the Task tracker knows which one was the last reported
+ range. On further attempts that range get skipped.
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+ mapred.join.define.<ident> to a classname. In the expression
+ mapred.join.expr, the identifier will be assumed to be a
+ ComposableRecordReader.
+ mapred.join.keycomparator can be a classname used to compare keys
+ in the join.
+ @see JoinRecordReader
+ @see MultiFilterRecordReader]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ capacity children to position
+ id in the parent reader.
+ The id of a root CompositeRecordReader is -1 by convention, but relying
+ on this is not recommended.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ override(S1,S2,S3) will prefer values
+ from S3 over S2, and values from S2 over S1 for all keys
+ emitted from all sources.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ [,,...,]]]>
+
+
+
+
+
+
+ out.
+ TupleWritable format:
+ {@code
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+
+
+ @param job job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overriden super.configure(...) should be
+ invoked at the beginning of the overwriter method.]]>
+
+
+
+
+
+
+
+
+
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overriden super.close() should be
+ invoked at the end of the overwriter method.]]>
+
+
+
+
+ The Mapper classes are invoked in a chained (or piped) fashion, the output of
+ the first becomes the input of the second, and so on until the last Mapper,
+ the output of the last Mapper will be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed in a chain. This enables having
+ reusable specialized Mappers that can be combined to perform composite
+ operations within a single task.
+
+ Special care has to be taken when creating chains that the key/values output
+ by a Mapper are valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reduce in the chain use maching output and input key and
+ value classes as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. And
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain.
+
+ ChainMapper usage pattern:
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Reducer leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Reducer does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Reducer the configuration given for it,
+ reducerConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ @param job job's JobConf to add the Reducer class.
+ @param klass the Reducer class to add.
+ @param inputKeyClass reducer input key class.
+ @param inputValueClass reducer input value class.
+ @param outputKeyClass reducer output key class.
+ @param outputValueClass reducer output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param reducerConf a JobConf with the configuration for the Reducer
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+ .
+
+ @param job chain job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overriden super.configure(...) should be
+ invoked at the beginning of the overwriter method.]]>
+
+
+
+
+
+
+
+
+
+ reduce(...) method of the Reducer with the
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overriden super.close() should be
+ invoked at the end of the overwriter method.]]>
+
+
+
+
+ For each record output by the Reducer, the Mapper classes are invoked in a
+ chained (or piped) fashion, the output of the first becomes the input of the
+ second, and so on until the last Mapper, the output of the last Mapper will
+ be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed after the Reducer or in a chain.
+ This enables having reusable specialized Mappers that can be combined to
+ perform composite operations within a single task.
+
+ Special care has to be taken when creating chains that the key/values output
+ by a Mapper are valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reduce in the chain use maching output and input key and
+ value classes as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. And
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ ChainReducer usage pattern:
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ @param freq The frequency with which records will be emitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ This will read every split at the client, which is very expensive.
+ @param freq Probability with which a key will be chosen.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ Takes the first numSamples / numSplits records from each split.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the name output is multi, false
+ if it is single. If the name output is not defined it returns
+ false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters, by default the are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+ @param conf job conf to enableadd the named output.
+ @param enabled indicates if the counters will be enabled or not.]]>
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters, by default the are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+
+ @param conf job conf to enableadd the named output.
+ @return TRUE if the counters are enabled, FALSE if they are disabled.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param multiName the multi name part
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+ If overriden subclasses must invoke super.close() at the
+ end of their close()
+
+ @throws java.io.IOException thrown if any of the MultipleOutput files
+ could not be closed properly.]]>
+
+
+
+ OutputCollector passed to
+ the map() and reduce() methods of the
+ Mapper and Reducer implementations.
+
+ Each additional output, or named output, may be configured with its own
+ OutputFormat, with its own key class and with its own value
+ class.
+
+ A named output can be a single file or a multi file. The later is refered as
+ a multi named output.
+
+ A multi named output is an unbound set of files all sharing the same
+ OutputFormat, key class and value class configuration.
+
+ When named outputs are used within a Mapper implementation,
+ key/values written to a name output are not part of the reduce phase, only
+ key/values written to the job OutputCollector are part of the
+ reduce phase.
+
+ MultipleOutputs supports counters, by default the are disabled. The counters
+ group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+ Job configuration usage pattern is:
+
+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+ SequenceFileOutputFormat.class,
+ LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+
+
+ Job configuration usage pattern is:
+
+
+ public class MOReduce implements
+ Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It can be used instead of the default implementation,
+ @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU
+ bound in order to improve throughput.
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of thread the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property, its default
+ value is 10 threads.
+
+ Alternatively, the properties can be set in the configuration with proper
+ values.
+
+ @see DBConfiguration#configureDB(JobConf, String, String, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...)
+ @see DBOutputFormat#setOutput(JobConf, String, String...)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 20070101 AND length > 0)'
+ @param orderBy the fieldNames in the orderBy clause.
+ @param fieldNames The field names in the table
+ @see #setInput(JobConf, Class, String, String)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBInputFormat emits LongWritables containing the record number as
+ key and DBWritables as value.
+
+ The SQL query, and input class can be using one of the two
+ setInput methods.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {@link DBOutputFormat} accepts <key,value> pairs, where
+ key has a type extending DBWritable. Returned {@link RecordWriter}
+ writes only the key to the database with a batch SQL query.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBWritable. DBWritable, is similar to {@link Writable}
+ except that the {@link #write(PreparedStatement)} method takes a
+ {@link PreparedStatement}, and {@link #readFields(ResultSet)}
+ takes a {@link ResultSet}.
+
+ Implementations are responsible for writing the fields of the object
+ to PreparedStatement, and reading the fields of the object from the
+ ResultSet.
+
+
Example:
+ If we have the following table in the database :
+
+ CREATE TABLE MyTable (
+ counter INTEGER NOT NULL,
+ timestamp BIGINT NOT NULL,
+ );
+
+ then we can read/write the tuples from/to the table with :
+
+ public class MyWritable implements Writable, DBWritable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ //Writable#write() implementation
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ //Writable#readFields() implementation
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public void write(PreparedStatement statement) throws SQLException {
+ statement.setInt(1, counter);
+ statement.setLong(2, timestamp);
+ }
+
+ public void readFields(ResultSet resultSet) throws SQLException {
+ counter = resultSet.getInt(1);
+ timestamp = resultSet.getLong(2);
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the later
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the later
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standarad command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ io.file.buffer.size specified in the given
+ Configuration.
+ @param in input stream
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take an insignificant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occured and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform else
+ (e.g. in Cygwin/Windows) it returns null.
+ @param conf configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutorshould be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remains unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new Sort(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/jdiff/hadoop_0.19.1.xml b/lib/jdiff/hadoop_0.19.1.xml
new file mode 100644
index 00000000000..92bdd2c7996
--- /dev/null
+++ b/lib/jdiff/hadoop_0.19.1.xml
@@ -0,0 +1,44195 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param in InputStream to deserialize the object from.]]>
+
+
+
+
+
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property
+ as an array of Class.
+ The value of the property specifies a list of comma separated class names.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the property name.
+ @param defaultValue default value.
+ @return property value as a Class[],
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Unless explicitly turned off, Hadoop by default specifies two
+ resources, loaded in-order from the classpath:
hadoop-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link org.apache.hadoop.mapred.JobConf}.
+ The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+ or {@link org.apache.hadoop.mapred.Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a pth pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is DistributedFileSystem.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf desitination buffer
+ @param offset offset in buf at which to store data
+ @param len maximun number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the postion to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destiniation buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids uneccessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types is not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines
+ the classes which will be wrapped in GenericObject in application.
+ Attention: this classes defined in getTypes() method, must
+ implement Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStrem to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to chose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting as position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserializing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.value;
+ int thatValue = ((IntWritable)o).value;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when skipped less number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-lzo library is loaded & initialized;
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair.
+ http://www.oberhumer.com/opensource/lzo/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ lzo compression/decompression pair compatible with lzop.
+ http://www.lzop.org/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FIXME: This array should be in a private or package private location,
+ since it could be modified by malicious code.
+ ]]>
+
+
+
+
+ This interface is public for historical purposes. You should have no need to
+ use it.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Although BZip2 headers are marked with the magic "Bz" this
+ constructor expects the next byte in the stream to be the first one after
+ the magic. Thus callers have to skip the first two bytes. Otherwise this
+ constructor will throw an exception.
+
+
+ @throws IOException
+ if the stream content is malformed or an I/O error occurs.
+ @throws NullPointerException
+ if in == null]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The decompression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2InputStream to release the allocated memory. See
+ {@link CBZip2OutputStream CBZip2OutputStream} for information about memory
+ usage.
+
+
+
+ CBZip2InputStream reads bytes from the compressed source stream via
+ the single byte {@link java.io.InputStream#read() read()} method exclusively.
+ Thus you should consider to use a buffered source stream.
+
+
+
+ Instances of this class are not threadsafe.
+
]]>
+
+
+
+
+
+
+
+
+
+ CBZip2OutputStream with a blocksize of 900k.
+
+
+ Attention: The caller is responsible to write the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+ @param out *
+ the destination stream.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws NullPointerException
+ if out == null.]]>
+
+
+
+
+
+ CBZip2OutputStream with specified blocksize.
+
+
+ Attention: The caller is responsible to write the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+
+ @param out
+ the destination stream.
+ @param blockSize
+ the blockSize as 100k units.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws IllegalArgumentException
+ if (blockSize < 1) || (blockSize > 9).
+ @throws NullPointerException
+ if out == null.
+
+ @see #MIN_BLOCKSIZE
+ @see #MAX_BLOCKSIZE]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ inputLength this method returns MAX_BLOCKSIZE
+ always.
+
+ @param inputLength
+ The length of the data which will be compressed by
+ CBZip2OutputStream.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ == 1.]]>
+
+
+
+
+ == 9.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If you are ever unlucky/improbable enough to get a stack overflow whilst
+ sorting, increase the following constant and try again. In practice I
+ have never seen the stack go above 27 elems, so the following limit seems
+ very generous.
+ ]]>
+
+
+
+
+ The compression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2OutputStream to release the allocated memory.
+
+
+
+ You can shrink the amount of allocated memory and maybe raise the compression
+ speed by choosing a lower blocksize, which in turn may cause a lower
+ compression ratio. You can avoid unnecessary memory allocation by avoiding
+ using a blocksize which is bigger than the size of the input.
+
+
+
+ You can compute the memory usage for compressing by the following formula:
+
+
+
+ <code>400k + (9 * blocksize)</code>.
+
+
+
+ To get the memory required for decompression by {@link CBZip2InputStream
+ CBZip2InputStream} use
+
+
+
+ <code>65k + (5 * blocksize)</code>.
+
+
+
+
+
+
+
Memory usage by blocksize
+
+
+
Blocksize
Compression
+ memory usage
Decompression
+ memory usage
+
+
+
100k
+
1300k
+
565k
+
+
+
200k
+
2200k
+
1065k
+
+
+
300k
+
3100k
+
1565k
+
+
+
400k
+
4000k
+
2065k
+
+
+
500k
+
4900k
+
2565k
+
+
+
600k
+
5800k
+
3065k
+
+
+
700k
+
6700k
+
3565k
+
+
+
800k
+
7600k
+
4065k
+
+
+
900k
+
8500k
+
4565k
+
+
+
+
+ For decompression CBZip2InputStream allocates less memory if the
+ bzipped input is smaller than one block.
+
+
+
+ Instances of this class are not threadsafe.
+
+
+
+ TODO: Update to BZip2 1.0.1
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo compressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if lzo decompressors are loaded & initialized,
+ else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-meterics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
Grouphandles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Note: The following is valid only if the {@link OutputCommitter}
+ is {@link FileOutputCommitter}. If OutputCommitter is not
+ a FileOutputCommitter, the task's temporary output
+ directory is same as {@link #getOutputPath(JobConf)} i.e.
+ ${mapred.output.dir}$
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The generated name can be used to create custom files from within the
+ different tasks for the job, the names for different tasks will not collide
+ with each other.
+
+
The given name is postfixed with the task type, 'm' for maps, 'r' for
+ reduces and the task partition number. For example, give a name 'test'
+ running on the first map of the job the generated name will be
+ 'test-m-00000'.
+
+ @param conf the configuration for the job.
+ @param name the name to make unique.
+ @return a unique name across all tasks of the job.]]>
+
+
+
+
+
+
+ The path can be used to create custom files from within the map and
+ reduce tasks. The path name will be unique for each task. The path parent
+ will be the job output directory.ls
+
+
This method uses the {@link #getUniqueName} method to make the file name
+ unique for the task.
+
+ @param conf the configuration for the job.
+ @param name the name for the file.
+ @return a unique path across all tasks of the job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size is insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ its status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
+
+
+ @see JobConf
+ @see ClusterStatus
+ @see Tool
+ @see DistributedCache]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If the parameter {@code loadDefaults} is false, the new instance
+ will not load resources from the default files.
+
+ @param loadDefaults specifies whether to load from the default files]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if framework should keep the intermediate files
+ for failed tasks, false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the outputs of the maps are to be compressed,
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This comparator should be provided if the equivalence rules for keys
+ for sorting the intermediates are different from those for grouping keys
+ before each call to
+ {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
+
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is the same as
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is a task-level aggregation operation which, in some cases,
+ helps to cut down the amount of data transferred from the {@link Mapper} to
+ the {@link Reducer}, leading to better performance.
+
+
Typically the combiner is the same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes a while, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
With 0.95 all of the reduces can launch immediately and
+ start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing it out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly
+ with the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debugability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see JobTracker#getNewJobId()
+ @see JobTracker#getStartTime()]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("mapred.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the outputrecords.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when the
+ job is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's cleanup-tasks, as a float between 0.0
+ and 1.0. When all cleanup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's cleanup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's setup-tasks, as a float between 0.0
+ and 1.0. When all setup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's setup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop provides an optional mode of execution in which the bad records
+ are detected and skipped in further attempts.
+
+
This feature can be used when map/reduce tasks crashes deterministically on
+ certain input. This happens due to bugs in the map/reduce function. The usual
+ course would be to fix these bugs. But sometimes this is not possible;
+ perhaps the bug is in third party libraries for which the source code is
+ not available. Due to this, the task never reaches to completion even with
+ multiple attempts and complete data for that task is lost.
+
+
With this feature, only a small portion of data is lost surrounding
+ the bad record, which may be acceptable for some user applications.
+ see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}
+
+
The skipping mode gets kicked off after certain no of failures
+ see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}
+
+
In the skipping mode, the map/reduce task maintains the record range which
+ is getting processed at all times. Before giving the input to the
+ map/reduce function, it sends this record range to the Task tracker.
+ If task crashes, the Task tracker knows which one was the last reported
+ range. On further attempts that range get skipped.
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+ mapred.join.define.<ident> to a classname. In the expression
+ mapred.join.expr, the identifier will be assumed to be a
+ ComposableRecordReader.
+ mapred.join.keycomparator can be a classname used to compare keys
+ in the join.
+ @see JoinRecordReader
+ @see MultiFilterRecordReader]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ capacity children to position
+ id in the parent reader.
+ The id of a root CompositeRecordReader is -1 by convention, but relying
+ on this is not recommended.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ override(S1,S2,S3) will prefer values
+ from S3 over S2, and values from S2 over S1 for all keys
+ emitted from all sources.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ [,,...,]]]>
+
+
+
+
+
+
+ out.
+ TupleWritable format:
+ {@code
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+
+
+ @param job job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overriden super.configure(...) should be
+ invoked at the beginning of the overwriter method.]]>
+
+
+
+
+
+
+
+
+
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overriden super.close() should be
+ invoked at the end of the overwriter method.]]>
+
+
+
+
+ The Mapper classes are invoked in a chained (or piped) fashion, the output of
+ the first becomes the input of the second, and so on until the last Mapper,
+ the output of the last Mapper will be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed in a chain. This enables having
+ reusable specialized Mappers that can be combined to perform composite
+ operations within a single task.
+
+ Special care has to be taken when creating chains that the key/values output
+ by a Mapper are valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reduce in the chain use maching output and input key and
+ value classes as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. And
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain.
+
+ ChainMapper usage pattern:
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Reducer leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Reducer does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Reducer the configuration given for it,
+ reducerConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ @param job job's JobConf to add the Reducer class.
+ @param klass the Reducer class to add.
+ @param inputKeyClass reducer input key class.
+ @param inputValueClass reducer input value class.
+ @param outputKeyClass reducer output key class.
+ @param outputValueClass reducer output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param reducerConf a JobConf with the configuration for the Reducer
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+ .
+
+ @param job chain job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overriden super.configure(...) should be
+ invoked at the beginning of the overwriter method.]]>
+
+
+
+
+
+
+
+
+
+ reduce(...) method of the Reducer with the
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overriden super.close() should be
+ invoked at the end of the overwriter method.]]>
+
+
+
+
+ For each record output by the Reducer, the Mapper classes are invoked in a
+ chained (or piped) fashion, the output of the first becomes the input of the
+ second, and so on until the last Mapper, the output of the last Mapper will
+ be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed after the Reducer or in a chain.
+ This enables having reusable specialized Mappers that can be combined to
+ perform composite operations within a single task.
+
+ Special care has to be taken when creating chains that the key/values output
+ by a Mapper are valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reduce in the chain use maching output and input key and
+ value classes as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. And
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ ChainReducer usage pattern:
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ @param freq The frequency with which records will be emitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ This will read every split at the client, which is very expensive.
+ @param freq Probability with which a key will be chosen.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ Takes the first numSamples / numSplits records from each split.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the name output is multi, false
+ if it is single. If the name output is not defined it returns
+ false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters, by default the are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+ @param conf job conf to enableadd the named output.
+ @param enabled indicates if the counters will be enabled or not.]]>
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters, by default the are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+
+ @param conf job conf to enableadd the named output.
+ @return TRUE if the counters are enabled, FALSE if they are disabled.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param multiName the multi name part
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+ If overriden subclasses must invoke super.close() at the
+ end of their close()
+
+ @throws java.io.IOException thrown if any of the MultipleOutput files
+ could not be closed properly.]]>
+
+
+
+ OutputCollector passed to
+ the map() and reduce() methods of the
+ Mapper and Reducer implementations.
+
+ Each additional output, or named output, may be configured with its own
+ OutputFormat, with its own key class and with its own value
+ class.
+
+ A named output can be a single file or a multi file. The later is refered as
+ a multi named output.
+
+ A multi named output is an unbound set of files all sharing the same
+ OutputFormat, key class and value class configuration.
+
+ When named outputs are used within a Mapper implementation,
+ key/values written to a name output are not part of the reduce phase, only
+ key/values written to the job OutputCollector are part of the
+ reduce phase.
+
+ MultipleOutputs supports counters, by default the are disabled. The counters
+ group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+ Job configuration usage pattern is:
+
+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+ SequenceFileOutputFormat.class,
+ LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+
+
+ Job configuration usage pattern is:
+
+
+ public class MOReduce implements
+ Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It can be used instead of the default implementation,
+ @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU
+ bound in order to improve throughput.
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of thread the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property, its default
+ value is 10 threads.
+
+ Alternatively, the properties can be set in the configuration with proper
+ values.
+
+ @see DBConfiguration#configureDB(JobConf, String, String, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...)
+ @see DBOutputFormat#setOutput(JobConf, String, String...)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 20070101 AND length > 0)'
+ @param orderBy the fieldNames in the orderBy clause.
+ @param fieldNames The field names in the table
+ @see #setInput(JobConf, Class, String, String)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBInputFormat emits LongWritables containing the record number as
+ key and DBWritables as value.
+
+ The SQL query, and input class can be using one of the two
+ setInput methods.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {@link DBOutputFormat} accepts <key,value> pairs, where
+ key has a type extending DBWritable. Returned {@link RecordWriter}
+ writes only the key to the database with a batch SQL query.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBWritable. DBWritable, is similar to {@link Writable}
+ except that the {@link #write(PreparedStatement)} method takes a
+ {@link PreparedStatement}, and {@link #readFields(ResultSet)}
+ takes a {@link ResultSet}.
+
+ Implementations are responsible for writing the fields of the object
+ to PreparedStatement, and reading the fields of the object from the
+ ResultSet.
+
+
Example:
+ If we have the following table in the database :
+
+ CREATE TABLE MyTable (
+ counter INTEGER NOT NULL,
+ timestamp BIGINT NOT NULL,
+ );
+
+ then we can read/write the tuples from/to the table with :
+
+ public class MyWritable implements Writable, DBWritable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ //Writable#write() implementation
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ //Writable#readFields() implementation
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public void write(PreparedStatement statement) throws SQLException {
+ statement.setInt(1, counter);
+ statement.setLong(2, timestamp);
+ }
+
+ public void readFields(ResultSet resultSet) throws SQLException {
+ counter = resultSet.getInt(1);
+ timestamp = resultSet.getLong(2);
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the later
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the later
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standarad command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to a an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ io.file.buffer.size specified in the given
+ Configuration.
+ @param in input stream
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take an insignificant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occured and time-out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform else
+ (e.g. in Cygwin/Windows) it returns null.
+ @param conf configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutorshould be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remains unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new Sort(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/jdiff/hadoop_0.20.0.xml b/lib/jdiff/hadoop_0.20.0.xml
new file mode 100644
index 00000000000..ce6f91bfe60
--- /dev/null
+++ b/lib/jdiff/hadoop_0.20.0.xml
@@ -0,0 +1,52140 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ final.
+
+ @param name resource to be added, the classpath is examined for a file
+ with that name.]]>
+
+
+
+
+
+ final.
+
+ @param url url of the resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param file file-path of resource to be added, the local filesystem is
+ examined directly to find the resource, without referring to
+ the classpath.]]>
+
+
+
+
+
+ final.
+
+ @param in InputStream to deserialize the object from.]]>
+
+
+
+
+
+
+
+
+
+
+ name property, null if
+ no such property exists.
+
+ Values are processed for variable expansion
+ before being returned.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+ name property, without doing
+ variable expansion.
+
+ @param name the property name.
+ @return the value of the name property,
+ or null if no such property exists.]]>
+
+
+
+
+
+
+ value of the name property.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property. If no such property
+ exists, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value, or defaultValue if the property
+ doesn't exist.]]>
+
+
+
+
+
+
+ name property as an int.
+
+ If no such property exists, or if the specified value is not a valid
+ int, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as an int,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to an int.
+
+ @param name property name.
+ @param value int value of the property.]]>
+
+
+
+
+
+
+ name property as a long.
+ If no such property is specified, or if the specified value is not a valid
+ long, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a long,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a long.
+
+ @param name property name.
+ @param value long value of the property.]]>
+
+
+
+
+
+
+ name property as a float.
+ If no such property is specified, or if the specified value is not a valid
+ float, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a float,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a float.
+
+ @param name property name.
+ @param value property value.]]>
+
+
+
+
+
+
+ name property as a boolean.
+ If no such property is specified, or if the specified value is not a valid
+ boolean, then defaultValue is returned.
+
+ @param name property name.
+ @param defaultValue default value.
+ @return property value as a boolean,
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property to a boolean.
+
+ @param name property name.
+ @param value boolean value of the property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property as
+ a collection of Strings.
+ If no such property is specified then empty collection is returned.
+
+ This is an optimized version of {@link #getStrings(String)}
+
+ @param name property name.
+ @return property value as a collection of Strings.]]>
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then null is returned.
+
+ @param name property name.
+ @return property value as an array of Strings,
+ or null.]]>
+
+
+
+
+
+
+ name property as
+ an array of Strings.
+ If no such property is specified then default value is returned.
+
+ @param name property name.
+ @param defaultValue The default value
+ @return property value as an array of Strings,
+ or default value.]]>
+
+
+
+
+
+
+ name property as
+ as comma delimited values.
+
+ @param name property name.
+ @param values The values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ name property
+ as an array of Class.
+ The value of the property specifies a list of comma separated class names.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the property name.
+ @param defaultValue default value.
+ @return property value as a Class[],
+ or defaultValue.]]>
+
+
+
+
+
+
+ name property as a Class.
+ If no such property is specified, then defaultValue is
+ returned.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property as a Class
+ implementing the interface specified by xface.
+
+ If no such property is specified, then defaultValue is
+ returned.
+
+ An exception is thrown if the returned class does not implement the named
+ interface.
+
+ @param name the class name.
+ @param defaultValue default value.
+ @param xface the interface implemented by the named class.
+ @return property value as a Class,
+ or defaultValue.]]>
+
+
+
+
+
+
+
+ name property to the name of a
+ theClass implementing the given interface xface.
+
+ An exception is thrown if theClass does not implement the
+ interface xface.
+
+ @param name property name.
+ @param theClass property value.
+ @param xface the interface implemented by the named class.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+ dirsProp with
+ the given path. If dirsProp contains multiple directories,
+ then one is chosen based on path's hash code. If the selected
+ directory does not exist, an attempt is made to create it.
+
+ @param dirsProp directory in which to locate the file.
+ @param path file-path.
+ @return local file under the directory with the given path.]]>
+
+
+
+
+
+
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return an input stream attached to the resource.]]>
+
+
+
+
+
+ name.
+
+ @param name configuration resource name.
+ @return a reader attached to the resource.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ String
+ key-value pairs in the configuration.
+
+ @return an iterator over the entries.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true to set quiet-mode on, false
+ to turn it off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Resources
+
+
Configurations are specified by resources. A resource contains a set of
+ name/value pairs as XML data. Each resource is named by either a
+ String or by a {@link Path}. If named by a String,
+ then the classpath is examined for a file with that name. If named by a
+ Path, then the local filesystem is examined directly, without
+ referring to the classpath.
+
+
Unless explicitly turned off, Hadoop by default specifies two
+ resources, loaded in-order from the classpath:
core-site.xml: Site-specific configuration for a given hadoop
+ installation.
+
+ Applications may add additional resources, which are loaded
+ subsequent to these resources in the order they are added.
+
+
Final Parameters
+
+
Configuration parameters may be declared final.
+ Once a resource declares a value final, no subsequently-loaded
+ resource can alter that value.
+ For example, one might define a final parameter with:
+
+
+ When conf.get("tempdir") is called, then ${basedir}
+ will be resolved to another property in this Configuration, while
+ ${user.name} would then ordinarily be resolved to the value
+ of the System property with that name.]]>
+
Applications specify the files, via urls (hdfs:// or http://) to be cached
+ via the {@link org.apache.hadoop.mapred.JobConf}.
+ The DistributedCache assumes that the
+ files specified via hdfs:// urls are already present on the
+ {@link FileSystem} at the path specified by the url.
+
+
The framework will copy the necessary files on to the slave node before
+ any tasks for the job are executed on that node. Its efficiency stems from
+ the fact that the files are only copied once per job and the ability to
+ cache archives which are un-archived on the slaves.
+
+
DistributedCache can be used to distribute simple, read-only
+ data/text files and/or more complex types such as archives, jars etc.
+ Archives (zip, tar and tgz/tar.gz files) are un-archived at the slave nodes.
+ Jars may be optionally added to the classpath of the tasks, a rudimentary
+ software distribution mechanism. Files have execution permissions.
+ Optionally users can also direct it to symlink the distributed cache file(s)
+ into the working directory of the task.
+
+
DistributedCache tracks modification timestamps of the cache
+ files. Clearly the cache files should not be modified by the application
+ or externally while the job is executing.
+
+
Here is an illustrative example on how to use the
+ DistributedCache:
+
+ // Setting up the cache for the application
+
+ 1. Copy the requisite files to the FileSystem:
+
+ $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
+ $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
+ $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
+ $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
+ $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
+ $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
+
+ 2. Setup the application's JobConf:
+
+ JobConf job = new JobConf();
+ DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
+ job);
+ DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
+ DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
+ DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
+
+ 3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
+ or {@link org.apache.hadoop.mapred.Reducer}:
+
+ public static class MapClass extends MapReduceBase
+ implements Mapper<K, V, K, V> {
+
+ private Path[] localArchives;
+ private Path[] localFiles;
+
+ public void configure(JobConf job) {
+ // Get the cached archives/files
+ localArchives = DistributedCache.getLocalCacheArchives(job);
+ localFiles = DistributedCache.getLocalCacheFiles(job);
+ }
+
+ public void map(K key, V value,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Use data from the cached archives/files here
+ // ...
+ // ...
+ output.collect(k, v);
+ }
+ }
+
+
+ A filename pattern is composed of regular characters and
+ special pattern matching characters, which are:
+
+
+
+
+
+
?
+
Matches any single character.
+
+
+
*
+
Matches zero or more characters.
+
+
+
[abc]
+
Matches a single character from character set
+ {a,b,c}.
+
+
+
[a-b]
+
Matches a single character from the character range
+ {a...b}. Note that character a must be
+ lexicographically less than or equal to character b.
+
+
+
[^a]
+
Matches a single character that is not from character set or range
+ {a}. Note that the ^ character must occur
+ immediately to the right of the opening bracket.
+
+
+
\c
+
Removes (escapes) any special meaning of character c.
+
+
+
{ab,cd}
+
Matches a string from the string set {ab, cd}
+
+
+
{ab,c{de,fh}}
+
Matches a string from the string set {ab, cde, cfh}
+
+
+
+
+
+ @param pathPattern a regular expression specifying a pth pattern
+
+ @return an array of paths that match the path pattern
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ All user code that may potentially use the Hadoop Distributed
+ File System should be written to use a FileSystem object. The
+ Hadoop DFS is a multi-machine system that appears as a single
+ disk. It's useful because of its fault tolerance and potentially
+ very large capacity.
+
+
+ The local implementation is {@link LocalFileSystem} and distributed
+ implementation is DistributedFileSystem.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FilterFileSystem contains
+ some other file system, which it uses as
+ its basic file system, possibly transforming
+ the data along the way or providing additional
+ functionality. The class FilterFileSystem
+ itself simply overrides all methods of
+ FileSystem with versions that
+ pass all requests to the contained file
+ system. Subclasses of FilterFileSystem
+ may further override some of these methods
+ and may also provide additional methods
+ and fields.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ buf at offset
+ and checksum into checksum.
+ The method is used for implementing read, therefore, it should be optimized
+ for sequential reading
+ @param pos chunkPos
+ @param buf desitination buffer
+ @param offset offset in buf at which to store data
+ @param len maximun number of bytes to read
+ @return number of bytes read]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -1 if the end of the
+ stream is reached.
+ @exception IOException if an I/O error occurs.]]>
+
+
+
+
+
+
+
+
+ This method implements the general contract of the corresponding
+ {@link InputStream#read(byte[], int, int) read} method of
+ the {@link InputStream} class. As an additional
+ convenience, it attempts to read as many bytes as possible by repeatedly
+ invoking the read method of the underlying stream. This
+ iterated read continues until one of the following
+ conditions becomes true:
+
+
The specified number of bytes have been read,
+
+
The read method of the underlying stream returns
+ -1, indicating end-of-file.
+
+
If the first read on the underlying stream returns
+ -1 to indicate end-of-file then this method returns
+ -1. Otherwise this method returns the number of bytes
+ actually read.
+
+ @param b destination buffer.
+ @param off offset at which to start storing bytes.
+ @param len maximum number of bytes to read.
+ @return the number of bytes read, or -1 if the end of
+ the stream has been reached.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if any checksum error occurs]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ n bytes of data from the
+ input stream.
+
+
This method may skip more bytes than are remaining in the backing
+ file. This produces no exception and the number of bytes skipped
+ may include some number of bytes that were beyond the EOF of the
+ backing file. Attempting to read from the stream after skipping past
+ the end will result in -1 indicating the end of the file.
+
+
If n is negative, no bytes are skipped.
+
+ @param n the number of bytes to be skipped.
+ @return the actual number of bytes skipped.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to skip to is corrupted]]>
+
+
+
+
+
+
+ This method may seek past the end of the file.
+ This produces no exception and an attempt to read from
+ the stream will result in -1 indicating the end of the file.
+
+ @param pos the postion to seek to.
+ @exception IOException if an I/O error occurs.
+ ChecksumException if the chunk to seek to is corrupted]]>
+
+
+
+
+
+
+
+
+
+ len bytes from
+ stm
+
+ @param stm an input stream
+ @param buf destiniation buffer
+ @param offset offset at which to store data
+ @param len number of bytes to read
+ @return actual number of bytes read
+ @throws IOException if there is any IO error]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len bytes from the specified byte array
+ starting at offset off and generate a checksum for
+ each data chunk.
+
+
This method stores bytes from the given array into this
+ stream's buffer before it gets checksumed. The buffer gets checksumed
+ and flushed to the underlying output stream when all data
+ in a checksum chunk are in the buffer. If the buffer is empty and
+ requested length is at least as large as the size of next checksum chunk
+ size, this method will checksum and write the chunk directly
+ to the underlying output stream. Thus it avoids uneccessary data copy.
+
+ @param b the data.
+ @param off the start offset in the data.
+ @param len the number of bytes to write.
+ @exception IOException if an I/O error occurs.]]>
+
+
+ DataInputBuffer buffer = new DataInputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using DataInput methods ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new DataOutputStream and
+ ByteArrayOutputStream each time data is written.
+
+
Typical usage is something like the following:
+
+ DataOutputBuffer buffer = new DataOutputBuffer();
+ while (... loop condition ...) {
+ buffer.reset();
+ ... write buffer using DataOutput methods ...
+ byte[] data = buffer.getData();
+ int dataLength = buffer.getLength();
+ ... write data to its ultimate destination ...
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to store
+ @param item the object to be stored
+ @param keyName the name of the key to use
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param items the objects to be stored
+ @param keyName the name of the key to use
+ @throws IndexOutOfBoundsException if the items array is empty
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+
+
+
+
+ the class of the item
+ @param conf the configuration to use
+ @param keyName the name of the key to use
+ @param itemClass the class of the item
+ @return restored object
+ @throws IOException : forwards Exceptions from the underlying
+ {@link Serialization} classes.]]>
+
+
+
+
+ DefaultStringifier offers convenience methods to store/load objects to/from
+ the configuration.
+
+ @param the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a DoubleWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a FloatWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When two sequence files, which have same Key type but different Value
+ types, are mapped out to reduce, multiple Value types is not allowed.
+ In this case, this class can help you wrap instances with different types.
+
+
+
+ Compared with ObjectWritable, this class is much more effective,
+ because ObjectWritable will append the class declaration as a String
+ into the output file in every Key-Value pair.
+
+
+
+ Generic Writable implements {@link Configurable} interface, so that it will be
+ configured by the framework. The configuration is passed to the wrapped objects
+ implementing {@link Configurable} interface before deserialization.
+
+
+ how to use it:
+ 1. Write your own class, such as GenericObject, which extends GenericWritable.
+ 2. Implements the abstract method getTypes(), defines
+ the classes which will be wrapped in GenericObject in application.
+ Attention: this classes defined in getTypes() method, must
+ implement Writable interface.
+
+
+ @since Nov 8, 2006]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This saves memory over creating a new InputStream and
+ ByteArrayInputStream each time data is read.
+
+
Typical usage is something like the following:
+
+ InputBuffer buffer = new InputBuffer();
+ while (... loop condition ...) {
+ byte[] data = ... get data ...;
+ int dataLength = ... get data length ...;
+ buffer.reset(data, dataLength);
+ ... read buffer using InputStream methods ...
+ }
+
+ @see DataInputBuffer
+ @see DataOutput]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a IntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ closes the input and output streams
+ at the end.
+ @param in InputStream to read from
+ @param out OutputStream to write to
+ @param conf the Configuration object]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ignore any {@link IOException} or
+ null pointers. Must only be used for cleanup in exception handlers.
+ @param log the log to record problems to at debug level. Can be null.
+ @param closeables the objects to close]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a LongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A map is a directory containing two files, the data file,
+ containing all keys and values in the map, and a smaller index
+ file, containing a fraction of the keys. The fraction is determined by
+ {@link Writer#getIndexInterval()}.
+
+
The index file is read entirely into memory. Thus key implementations
+ should try to keep themselves small.
+
+
Map files are created by adding entries in-order. To maintain a large
+ database, perform updates by copying the previous version of a database and
+ merging in a sorted change list, to create a new version of the database in
+ a new file. Sorting large change lists can be done with {@link
+ SequenceFile.Sorter}.]]>
+
SequenceFile provides {@link Writer}, {@link Reader} and
+ {@link Sorter} classes for writing, reading and sorting respectively.
+
+ There are three SequenceFileWriters based on the
+ {@link CompressionType} used to compress key/value pairs:
+
+
+ Writer : Uncompressed records.
+
+
+ RecordCompressWriter : Record-compressed files, only compress
+ values.
+
+
+ BlockCompressWriter : Block-compressed files, both keys &
+ values are collected in 'blocks'
+ separately and compressed. The size of
+ the 'block' is configurable.
+
+
+
The actual compression algorithm used to compress key and/or values can be
+ specified by using the appropriate {@link CompressionCodec}.
+
+
The recommended way is to use the static createWriter methods
+ provided by the SequenceFile to choose the preferred format.
+
+
The {@link Reader} acts as the bridge and can read any of the above
+ SequenceFile formats.
+
+
SequenceFile Formats
+
+
Essentially there are 3 different formats for SequenceFiles
+ depending on the CompressionType specified. All of them share a
+ common header described below.
+
+
SequenceFile Header
+
+
+ version - 3 bytes of magic header SEQ, followed by 1 byte of actual
+ version number (e.g. SEQ4 or SEQ6)
+
+
+ keyClassName -key class
+
+
+ valueClassName - value class
+
+
+ compression - A boolean which specifies if compression is turned on for
+ keys/values in this file.
+
+
+ blockCompression - A boolean which specifies if block-compression is
+ turned on for keys/values in this file.
+
+
+ compression codec - CompressionCodec class which is used for
+ compression of keys and/or values (if compression is
+ enabled).
+
+
+ metadata - {@link Metadata} for this file.
+
+
+ sync - A sync marker to denote end of the header.
+
The compressed blocks of key lengths and value lengths consist of the
+ actual lengths of individual keys/values encoded in ZeroCompressedInteger
+ format.
+
+ @see CompressionCodec]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key, skipping its
+ value. True if another entry exists, and false at end of file.]]>
+
+
+
+
+
+
+
+ key and
+ val. Returns true if such a pair exists and false when at
+ end of file]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The position passed must be a position returned by {@link
+ SequenceFile.Writer#getLength()} when writing this file. To seek to an arbitrary
+ position, use {@link SequenceFile.Reader#sync(long)}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ SegmentDescriptor
+ @param segments the list of SegmentDescriptors
+ @param tmpDir the directory to write temporary files into
+ @return RawKeyValueIterator
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For best performance, applications should make sure that the {@link
+ Writable#readFields(DataInput)} implementation of their keys is
+ very efficient. In particular, it should avoid allocating memory.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This always returns a synchronized position. In other words,
+ immediately after calling {@link SequenceFile.Reader#seek(long)} with a position
+ returned by this method, {@link SequenceFile.Reader#next(Writable)} may be called. However
+ the key may be earlier in the file than key last written when this
+ method was called (e.g., with block-compression, it may be the first key
+ in the block that was being written when this method was called).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ key. Returns
+ true if such a key exists and false when at the end of the set.]]>
+
+
+
+
+
+
+ key.
+ Returns key, or null if no match exists.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the objects to stringify]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ position. Note that this
+ method avoids using the converter or doing String instantiation
+ @return the Unicode scalar value at position or -1
+ if the position is invalid or points to a
+ trailing byte]]>
+
+
+
+
+
+
+
+
+
+ what in the backing
+ buffer, starting as position start. The starting
+ position is measured in bytes and the return value is in
+ terms of byte position in the buffer. The backing buffer is
+ not converted to a string for this operation.
+ @return byte position of the first occurrence of the search
+ string in the UTF-8 buffer or -1 if not found]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a Text with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ replace is true, then
+ malformed input is replaced with the
+ substitution character, which is U+FFFD. Otherwise the
+ method throws a MalformedInputException.
+ @return ByteBuffer: bytes stores at ByteBuffer.array()
+ and length is ByteBuffer.limit()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ In
+ addition, it provides methods for string traversal without converting the
+ byte array to a string.
Also includes utilities for
+ serializing/deserialing a string, coding/decoding a string, checking if a
+ byte array contains valid UTF8 code, calculating the length of an encoded
+ string.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a UTF8 with the same contents.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Also includes utilities for efficiently reading and writing UTF-8.
+
+ @deprecated replaced by Text]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is useful when a class may evolve, so that instances written by the
+ old version of the class may still be processed by the new version. To
+ handle this situation, {@link #readFields(DataInput)}
+ implementations should catch {@link VersionMismatchException}.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VIntWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ o is a VLongWritable with the same value.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ out.
+
+ @param out DataOuput to serialize this object into.
+ @throws IOException]]>
+
+
+
+
+
+
+ in.
+
+
For efficiency, implementations should attempt to re-use storage in the
+ existing object where possible.
+
+ @param in DataInput to deserialize this object from.
+ @throws IOException]]>
+
+
+
+ Any key or value type in the Hadoop Map-Reduce
+ framework implements this interface.
+
+
Implementations typically implement a static read(DataInput)
+ method which constructs a new instance, calls {@link #readFields(DataInput)}
+ and returns the instance.
+
+
Example:
+
+ public class MyWritable implements Writable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public static MyWritable read(DataInput in) throws IOException {
+ MyWritable w = new MyWritable();
+ w.readFields(in);
+ return w;
+ }
+ }
+
]]>
+
+
+
+
+
+
+
+
+ WritableComparables can be compared to each other, typically
+ via Comparators. Any type which is to be used as a
+ key in the Hadoop Map-Reduce framework should implement this
+ interface.
+
+
Example:
+
+ public class MyWritableComparable implements WritableComparable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public int compareTo(MyWritableComparable w) {
+ int thisValue = this.value;
+ int thatValue = ((IntWritable)o).value;
+ return (thisValue < thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
+ }
+ }
+
One may optimize compare-intensive operations by overriding
+ {@link #compare(byte[],int,int,byte[],int,int)}. Static utility methods are
+ provided to assist in optimized implementations of this method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Enum type
+ @param in DataInput to read from
+ @param enumType Class type of Enum
+ @return Enum represented by String read from DataInput
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ len number of bytes in input stream in
+ @param in input stream
+ @param len number of bytes to skip
+ @throws IOException when skipped less number of bytes]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CompressionCodec for which to get the
+ Compressor
+ @return Compressor for the given
+ CompressionCodec from the pool or a new one]]>
+
+
+
+
+
+ CompressionCodec for which to get the
+ Decompressor
+ @return Decompressor for the given
+ CompressionCodec the pool or a new one]]>
+
+
+
+
+
+ Compressor to be returned to the pool]]>
+
+
+
+
+
+ Decompressor to be returned to the
+ pool]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Implementations are assumed to be buffered. This permits clients to
+ reposition the underlying input stream then call {@link #resetState()},
+ without having to also synchronize client buffers.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true indicating that more input data is required.
+
+ @param b Input data
+ @param off Start offset
+ @param len Length]]>
+
+
+
+
+ true if the input data buffer is empty and
+ #setInput() should be called in order to provide more input.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if a preset dictionary is needed for decompression.
+ @return true if a preset dictionary is needed for decompression]]>
+
+
+
+
+ true if the end of the compressed
+ data output stream has been reached.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FIXME: This array should be in a private or package private location,
+ since it could be modified by malicious code.
+ ]]>
+
+
+
+
+ This interface is public for historical purposes. You should have no need to
+ use it.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Although BZip2 headers are marked with the magic "Bz" this
+ constructor expects the next byte in the stream to be the first one after
+ the magic. Thus callers have to skip the first two bytes. Otherwise this
+ constructor will throw an exception.
+
+
+ @throws IOException
+ if the stream content is malformed or an I/O error occurs.
+ @throws NullPointerException
+ if in == null]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The decompression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2InputStream to release the allocated memory. See
+ {@link CBZip2OutputStream CBZip2OutputStream} for information about memory
+ usage.
+
+
+
+ CBZip2InputStream reads bytes from the compressed source stream via
+ the single byte {@link java.io.InputStream#read() read()} method exclusively.
+ Thus you should consider to use a buffered source stream.
+
+
+
+ Instances of this class are not threadsafe.
+
]]>
+
+
+
+
+
+
+
+
+
+ CBZip2OutputStream with a blocksize of 900k.
+
+
+ Attention: The caller is responsible to write the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+ @param out *
+ the destination stream.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws NullPointerException
+ if out == null.]]>
+
+
+
+
+
+ CBZip2OutputStream with specified blocksize.
+
+
+ Attention: The caller is responsible to write the two BZip2 magic
+ bytes "BZ" to the specified stream prior to calling this
+ constructor.
+
+
+
+ @param out
+ the destination stream.
+ @param blockSize
+ the blockSize as 100k units.
+
+ @throws IOException
+ if an I/O error occurs in the specified stream.
+ @throws IllegalArgumentException
+ if (blockSize < 1) || (blockSize > 9).
+ @throws NullPointerException
+ if out == null.
+
+ @see #MIN_BLOCKSIZE
+ @see #MAX_BLOCKSIZE]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ inputLength this method returns MAX_BLOCKSIZE
+ always.
+
+ @param inputLength
+ The length of the data which will be compressed by
+ CBZip2OutputStream.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ == 1.]]>
+
+
+
+
+ == 9.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If you are ever unlucky/improbable enough to get a stack overflow whilst
+ sorting, increase the following constant and try again. In practice I
+ have never seen the stack go above 27 elems, so the following limit seems
+ very generous.
+ ]]>
+
+
+
+
+ The compression requires large amounts of memory. Thus you should call the
+ {@link #close() close()} method as soon as possible, to force
+ CBZip2OutputStream to release the allocated memory.
+
+
+
+ You can shrink the amount of allocated memory and maybe raise the compression
+ speed by choosing a lower blocksize, which in turn may cause a lower
+ compression ratio. You can avoid unnecessary memory allocation by avoiding
+ using a blocksize which is bigger than the size of the input.
+
+
+
+ You can compute the memory usage for compressing by the following formula:
+
+
+
+ <code>400k + (9 * blocksize)</code>.
+
+
+
+ To get the memory required for decompression by {@link CBZip2InputStream
+ CBZip2InputStream} use
+
+
+
+ <code>65k + (5 * blocksize)</code>.
+
+
+
+
+
+
+
Memory usage by blocksize
+
+
+
Blocksize
Compression
+ memory usage
Decompression
+ memory usage
+
+
+
100k
+
1300k
+
565k
+
+
+
200k
+
2200k
+
1065k
+
+
+
300k
+
3100k
+
1565k
+
+
+
400k
+
4000k
+
2065k
+
+
+
500k
+
4900k
+
2565k
+
+
+
600k
+
5800k
+
3065k
+
+
+
700k
+
6700k
+
3565k
+
+
+
800k
+
7600k
+
4065k
+
+
+
900k
+
8500k
+
4565k
+
+
+
+
+ For decompression CBZip2InputStream allocates less memory if the
+ bzipped input is smaller than one block.
+
+
+
+ Instances of this class are not threadsafe.
+
+
+
+ TODO: Update to BZip2 1.0.1
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @return the total (non-negative) number of uncompressed bytes input so far]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-zlib is loaded & initialized
+ and can be loaded for this job, else false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying for a maximum time, waiting a fixed time between attempts,
+ and then fail by re-throwing the exception.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by the number of tries so far.
+ ]]>
+
+
+
+
+
+
+
+
+ Keep trying a limited number of times, waiting a growing amount of time between attempts,
+ and then fail by re-throwing the exception.
+ The time between attempts is sleepTime multiplied by a random
+ number in the range of [0, 2 to the number of retries)
+ ]]>
+
+
+
+
+
+
+
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+
+
+ A retry policy for RemoteException
+ Set a default policy with some explicit handlers for specific exceptions.
+ ]]>
+
+
+
+
+
+ Try once, and fail by re-throwing the exception.
+ This corresponds to having no retry mechanism in place.
+ ]]>
+
+
+
+
+
+ Try once, and fail silently for void methods, or by
+ re-throwing the exception for non-void methods.
+ ]]>
+
+
+
+
+
+ Keep trying forever.
+ ]]>
+
+
+
+
+ A collection of useful implementations of {@link RetryPolicy}.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Determines whether the framework should retry a
+ method for the given exception, and the number
+ of retries that have been made for that operation
+ so far.
+
+ @param e The exception that caused the method to fail.
+ @param retries The number of times the method has been retried.
+ @return true if the method should be retried,
+ false if the method should not be retried
+ but shouldn't fail with an exception (only for void methods).
+ @throws Exception The re-thrown exception e indicating
+ that the method failed and should not be retried further.]]>
+
+
+
+
+ Specifies a policy for retrying method failures.
+ Implementations of this interface should be immutable.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a proxy for an interface of an implementation class
+ using the same retry policy for each method in the interface.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param retryPolicy the policy for retrying method call failures
+ @return the retry proxy]]>
+
+
+
+
+
+
+
+
Create a proxy for an interface of an implementation class
+ using a set of retry policies specified by method name.
+ If no retry policy is defined for a method then a default of
+ {@link RetryPolicies#TRY_ONCE_THEN_FAIL} is used.
+
+ @param iface the interface that the retry will implement
+ @param implementation the instance whose methods should be retried
+ @param methodNameToPolicyMap a map of method names to retry policies
+ @return the retry proxy]]>
+
+
+
+
+ A factory for creating retry proxies.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+ Prepare the deserializer for reading.]]>
+
+
+
+
+
+
+
+ Deserialize the next object from the underlying input stream.
+ If the object t is non-null then this deserializer
+ may set its internal state to the next object read from the input
+ stream. Otherwise, if the object t is null a new
+ deserialized object will be created.
+
+ @return the deserialized object]]>
+
+
+
+
+
+ Close the underlying input stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for deserializing objects of type from an
+ {@link InputStream}.
+
+
+
+ Deserializers are stateful, but must not buffer the input since
+ other producers may read from the input between calls to
+ {@link #deserialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link Deserializer} to deserialize
+ the objects to be compared so that the standard {@link Comparator} can
+ be used to compare them.
+
+
+ One may optimize compare-intensive operations by using a custom
+ implementation of {@link RawComparator} that operates directly
+ on byte representations.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An experimental {@link Serialization} for Java {@link Serializable} classes.
+
+ @see JavaSerializationComparator]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A {@link RawComparator} that uses a {@link JavaSerialization}
+ {@link Deserializer} to deserialize objects that are then compared via
+ their {@link Comparable} interfaces.
+
+ @param
+ @see JavaSerialization]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Encapsulates a {@link Serializer}/{@link Deserializer} pair.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+ Serializations are found by reading the io.serializations
+ property from conf, which is a comma-delimited list of
+ classnames.
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A factory for {@link Serialization}s.
+ ]]>
+
+
+
+
+
+
+
+
+
+ Prepare the serializer for writing.]]>
+
+
+
+
+
+
+ Serialize t to the underlying output stream.]]>
+
+
+
+
+
+ Close the underlying output stream and clear up any resources.]]>
+
+
+
+
+ Provides a facility for serializing objects of type to an
+ {@link OutputStream}.
+
+
+
+ Serializers are stateful, but must not buffer the output since
+ other producers may write to the output between calls to
+ {@link #serialize(Object)}.
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address, returning the value. Throws exceptions if there are
+ network problems or if the remote code threw an exception.
+ @deprecated Use {@link #call(Writable, InetSocketAddress, Class, UserGroupInformation)} instead]]>
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address with the ticket credentials, returning
+ the value.
+ Throws exceptions if there are network problems or if the remote code
+ threw an exception.
+ @deprecated Use {@link #call(Writable, InetSocketAddress, Class, UserGroupInformation)} instead]]>
+
+
+
+
+
+
+
+
+
+
+ param, to the IPC server running at
+ address which is servicing the protocol protocol,
+ with the ticket credentials, returning the value.
+ Throws exceptions if there are network problems or if the remote code
+ threw an exception.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unwraps any IOException.
+
+ @param lookupTypes the desired exception class.
+ @return IOException, which is either the lookupClass exception or this.]]>
+
+
+
+
+ This unwraps any Throwable that has a constructor taking
+ a String as a parameter.
+ Otherwise it returns this.
+
+ @return Throwable]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ protocol is a Java interface. All parameters and return types must
+ be one of:
+
+
a primitive type, boolean, byte,
+ char, short, int, long,
+ float, double, or void; or
+
+
a {@link String}; or
+
+
a {@link Writable}; or
+
+
an array of the above types
+
+ All methods in the protocol should throw only IOException. No field data of
+ the protocol instance is transmitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ handlerCount determines
+ the number of handler threads that will be used to process calls.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name=RpcActivityForPort"
+
+ Many of the activity metrics are sampled and averaged on an interval
+ which can be specified in the metrics config file.
+
+ For the metrics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most metrics contexts do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically
+
+
+
+ Impl details: We use a dynamic mbean that gets the list of the metrics
+ from the metrics registry passed as an argument to the constructor]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class has a number of metrics variables that are publicly accessible;
+ these variables (objects) have methods to update their values;
+ for example:
+
{@link #rpcQueueTime}.inc(time)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ For the statistics that are sampled and averaged, one must specify
+ a metrics context that does periodic update calls. Most do.
+ The default Null metrics context however does NOT. So if you aren't
+ using any other metrics context then you can turn on the viewing and averaging
+ of sampled metrics by specifying the following two lines
+ in the hadoop-metrics.properties file:
+
+ Note that the metrics are collected regardless of the context used.
+ The context with the update thread is used to average the data periodically]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When constructing the instance, if the factory property
+ contextName.class exists,
+ its value is taken to be the name of the class to instantiate. Otherwise,
+ the default is to create an instance of
+ org.apache.hadoop.metrics.spi.NullContext, which is a
+ dummy "no-op" context which will cause all metric data to be discarded.
+
+ @param contextName the name of the context
+ @return the named MetricsContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ When the instance is constructed, this method checks if the file
+ hadoop-metrics.properties exists on the class path. If it
+ exists, it must be in the format defined by java.util.Properties, and all
+ the properties in the file are set as attributes on the newly created
+ ContextFactory instance.
+
+ @return the singleton ContextFactory instance]]>
+
+
+
+ getFactory() method.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ startMonitoring() again after calling
+ this.
+ @see #close()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A record name identifies the kind of data to be reported. For example, a
+ program reporting statistics relating to the disks on a computer might use
+ a record name "diskStats".
+
+ A record has zero or more tags. A tag has a name and a value. To
+ continue the example, the "diskStats" record might use a tag named
+ "diskName" to identify a particular disk. Sometimes it is useful to have
+ more than one tag, so there might also be a "diskType" with value "ide" or
+ "scsi" or whatever.
+
+ A record also has zero or more metrics. These are the named
+ values that are to be reported to the metrics system. In the "diskStats"
+ example, possible metric names would be "diskPercentFull", "diskPercentBusy",
+ "kbReadPerSecond", etc.
+
+ The general procedure for using a MetricsRecord is to fill in its tag and
+ metric values, and then call update() to pass the record to the
+ client library.
+ Metric data is not immediately sent to the metrics system
+ each time that update() is called.
+ An internal table is maintained, identified by the record name. This
+ table has columns
+ corresponding to the tag and the metric names, and rows
+ corresponding to each unique set of tag values. An update
+ either modifies an existing row in the table, or adds a new row with a set of
+ tag values that are different from all the other rows. Note that if there
+ are no tags, then there can be at most one row in the table.
+
+ Once a row is added to the table, its data will be sent to the metrics system
+ on every timer period, whether or not it has been updated since the previous
+ timer period. If this is inappropriate, for example if metrics were being
+ reported by some transient object in an application, the remove()
+ method can be used to remove the row and thus stop the data from being
+ sent.
+
+ Note that the update() method is atomic. This means that it is
+ safe for different threads to be updating the same metric. More precisely,
+ it is OK for different threads to call update() on MetricsRecord instances
+ with the same set of tag names and tag values. Different threads should
+ not use the same MetricsRecord instance at the same time.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MetricsContext.registerUpdater().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ fileName attribute,
+ if specified. Otherwise the data will be written to standard
+ output.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class is configured by setting ContextFactory attributes which in turn
+ are usually configured through a properties file. All the attributes are
+ prefixed by the contextName. For example, the properties file might contain:
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ contextName.tableName. The returned map consists of
+ those attributes with the contextName and tableName stripped off.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ recordName.
+ Throws an exception if the metrics implementation is configured with a fixed
+ set of record names and recordName is not in that set.
+
+ @param recordName the name of the record
+ @throws MetricsException if recordName conflicts with configuration data]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This class implements the internal table of metric data, and the timer
+ on which data is to be sent to the metrics system. Subclasses must
+ override the abstract emitRecord method in order to transmit
+ the data. ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ update
+ and remove().]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hostname or hostname:port. If
+ the specs string is null, defaults to localhost:defaultPort.
+
+ @return a list of InetSocketAddress objects.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ,name="
+ Where the and are the supplied parameters
+
+ @param serviceName
+ @param nameName
+ @param theMbean - the MBean to register
+ @return the named used to register the MBean]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ hadoop.rpc.socket.factory.class.<ClassName>. When no
+ such parameter exists then fall back on the default socket factory as
+ configured by hadoop.rpc.socket.factory.class.default. If
+ this default socket factory is not configured, then fall back on the JVM
+ default socket factory.
+
+ @param conf the configuration
+ @param clazz the class (usually a {@link VersionedProtocol})
+ @return a socket factory]]>
+
+
+
+
+
+ hadoop.rpc.socket.factory.default
+
+ @param conf the configuration
+ @return the default socket factory as specified in the configuration or
+ the JVM default socket factory if the configuration does not
+ contain a default socket factory property.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+ :
+ ://:/]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From documentation for {@link #getInputStream(Socket, long)}:
+ Returns InputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketInputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getInputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the timeout set with
+ {@link Socket#setSoTimeout(int)} applies for reads.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see #getInputStream(Socket, long)
+
+ @param socket
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getInputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return InputStream for reading from the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ From documentation for {@link #getOutputStream(Socket, long)} :
+ Returns OutputStream for the socket. If the socket has an associated
+ SocketChannel then it returns a
+ {@link SocketOutputStream} with the given timeout. If the socket does not
+ have a channel, {@link Socket#getOutputStream()} is returned. In the latter
+ case, the timeout argument is ignored and the write will wait until
+ data is available.
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see #getOutputStream(Socket, long)
+
+ @param socket
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+ Any socket created using socket factories returned by {@link #NetUtils},
+ must use this interface instead of {@link Socket#getOutputStream()}.
+
+ @see Socket#getChannel()
+
+ @param socket
+ @param timeout timeout in milliseconds. This may not always apply. zero
+ for waiting as long as necessary.
+ @return OutputStream for writing to the socket.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+ socket.connect(endpoint, timeout). If
+ socket.getChannel() returns a non-null channel,
+ connect is implemented using Hadoop's selectors. This is done mainly
+ to avoid Sun's connect implementation from creating thread-local
+ selectors, since Hadoop does not have control on when these are closed
+ and could end up taking all the available file descriptors.
+
+ @see java.net.Socket#connect(java.net.SocketAddress, int)
+
+ @param socket
+ @param endpoint
+ @param timeout - timeout in milliseconds]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ node
+
+ @param node
+ a node
+ @return true if node is already in the tree; false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ scope
+ if scope starts with ~, choose one from the all nodes except for the
+ ones in scope; otherwise, choose one from scope
+ @param scope range of nodes from which a node will be chosen
+ @return the chosen node]]>
+
+
+
+
+
+
+ scope but not in excludedNodes
+ if scope starts with ~, return the number of nodes that are not
+ in scope and excludedNodes;
+ @param scope a path string that may start with ~
+ @param excludedNodes a list of nodes
+ @return number of available nodes]]>
+
+
+
+
+
+
+
+
+
+
+
+ reader
+ It linearly scans the array, if a local node is found, swap it with
+ the first element of the array.
+ If a local rack node is found, swap it with the first element following
+ the local node.
+ If neither a local node nor a local rack node is found, put a random replica
+ location at position 0.
+ It leaves the rest nodes untouched.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. Must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+ Create a new input stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+ @see SocketInputStream#SocketInputStream(ReadableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create a new output stream with the given timeout. If the timeout
+ is zero, it will be treated as infinite timeout. The socket's
+ channel will be configured to be non-blocking.
+
+ @see SocketOutputStream#SocketOutputStream(WritableByteChannel, long)
+
+ @param socket should have a channel associated with it.
+ @param timeout timeout in milliseconds. Must not be negative.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ = getCount().
+ @param newCapacity The new capacity in bytes.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Index idx = startVector(...);
+ while (!idx.done()) {
+ .... // read element of a vector
+ idx.incr();
+ }
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This task takes the given record definition files and compiles them into
+ java or c++
+ files. It is then up to the user to compile the generated files.
+
+
The task requires the file or the nested fileset element to be
+ specified. Optional attributes are language (set the output
+ language, default is "java"),
+ destdir (name of the destination directory for generated java/c++
+ code, default is ".") and failonerror (specifies error handling
+ behavior. default is true).
+
]]>
+
+
+
+
+
+
+
+
+
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cause==null ? null : cause.toString()) (which
+ typically contains the class and detail message of cause).
+ @param cause the cause (which is saved for later retrieval by the
+ {@link #getCause()} method). (A null value is
+ permitted, and indicates that the cause is nonexistent or
+ unknown.)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ Group with the given groupname.
+ @param group group name]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ugi.
+ @param ugi user
+ @return the {@link Subject} for the user identified by ugi]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ugi as a comma separated string in
+ conf as a property attr
+
+ The String starts with the user name followed by the default group names,
+ and other group names.
+
+ @param conf configuration
+ @param attr property name
+ @param ugi a UnixUserGroupInformation]]>
+
+
+
+
+
+
+
+ conf
+
+ The object is expected to store with the property name attr
+ as a comma separated string that starts
+ with the user name followed by group names.
+ If the property name is not defined, return null.
+ It's assumed that there is only one UGI per user. If this user already
+ has a UGI in the ugi map, return the ugi in the map.
+ Otherwise, construct a UGI from the configuration, store it in the
+ ugi map and return it.
+
+ @param conf configuration
+ @param attr property name
+ @return a UnixUGI
+ @throws LoginException if the stored string is ill-formatted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ User with the given username.
+ @param user user name]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cause==null ? null : cause.toString()) (which
+ typically contains the class and detail message of cause).
+ @param cause the cause (which is saved for later retrieval by the
+ {@link #getCause()} method). (A null value is
+ permitted, and indicates that the cause is nonexistent or
+ unknown.)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ does not provide the stack trace for security purposes.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ service as related to
+ Service Level Authorization for Hadoop.
+
+ Each service defines its configuration key and also the necessary
+ {@link Permission} required to access the service.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ in]]>
+
+
+
+
+
+
+ out.]]>
+
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+ reset is true, then resets the checksum.
+ @return number of bytes written. Will be equal to getChecksumSize();]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser to parse only the generic Hadoop
+ arguments.
+
+ The array of string arguments other than the generic arguments can be
+ obtained by {@link #getRemainingArgs()}.
+
+ @param conf the Configuration to modify.
+ @param args command-line arguments.]]>
+
+
+
+
+ GenericOptionsParser to parse given options as well
+ as generic Hadoop options.
+
+ The resulting CommandLine object can be obtained by
+ {@link #getCommandLine()}.
+
+ @param conf the configuration to modify
+ @param options options built by the caller
+ @param args User-specified arguments]]>
+
+
+
+
+ Strings containing the un-parsed arguments
+ or empty array if commandLine was not defined.]]>
+
+
+
+
+
+
+
+
+
+ CommandLine object
+ to process the parsed arguments.
+
+ Note: If the object is created with
+ {@link #GenericOptionsParser(Configuration, String[])}, then returned
+ object will only contain parsed generic options.
+
+ @return CommandLine representing list of arguments
+ parsed against Options descriptor.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ GenericOptionsParser is a utility to parse command line
+ arguments generic to the Hadoop framework.
+
+ GenericOptionsParser recognizes several standard command
+ line arguments, enabling applications to easily specify a namenode, a
+ jobtracker, additional configuration resources etc.
+
+
Generic Options
+
+
The supported generic options are:
+
+ -conf <configuration file> specify a configuration file
+ -D <property=value> use value for given property
+ -fs <local|namenode:port> specify a namenode
+ -jt <local|jobtracker:port> specify a job tracker
+ -files <comma separated list of files> specify comma separated
+ files to be copied to the map reduce cluster
+ -libjars <comma separated list of jars> specify comma separated
+ jar files to include in the classpath.
+ -archives <comma separated list of archives> specify comma
+ separated archives to be unarchived on the compute machines.
+
+
Generic command line arguments might modify
+ Configuration objects, given to constructors.
+
+
The functionality is implemented using Commons CLI.
+
+
Examples:
+
+ $ bin/hadoop dfs -fs darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
+ list /data directory in dfs with namenode darwin:8020
+
+ $ bin/hadoop dfs -conf hadoop-site.xml -ls /data
+ list /data directory in dfs with conf specified in hadoop-site.xml
+
+ $ bin/hadoop job -D mapred.job.tracker=darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt darwin:50020 -submit job.xml
+ submit a job to job tracker darwin:50020
+
+ $ bin/hadoop job -jt local -submit job.xml
+ submit a job to local runner
+
+ $ bin/hadoop jar -libjars testlib.jar
+ -archives test.tgz -files file.txt inputjar args
+ job submission with libjars, files and archives
+
+
+ @see Tool
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+ Class<T>) of the
+ argument of type T.
+ @param The type of the argument
+ @param t the object to get it class
+ @return Class<T>]]>
+
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param c the Class object of the items in the list
+ @param list the list to convert]]>
+
+
+
+
+
+ List<T> to an array of
+ T[].
+ @param list the list to convert
+ @throws ArrayIndexOutOfBoundsException if the list is empty.
+ Use {@link #toArray(Class, List)} if the list may be empty.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ io.file.buffer.size specified in the given
+ Configuration.
+ @param in input stream
+ @param conf configuration
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if native-hadoop is loaded,
+ else false]]>
+
+
+
+
+
+ true if native hadoop libraries, if present, can be
+ used for this job; false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ { pq.top().change(); pq.adjustTop(); }
+ instead of
+ { o = pq.pop(); o.change(); pq.push(o); }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Clients and/or applications can use the provided Progressable
+ to explicitly report progress to the Hadoop framework. This is especially
+ important for operations which take an insignificant amount of time since,
+ in-lieu of the reported progress, the framework has to assume that an error
+ has occurred and time out the operation.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Class is to be obtained
+ @return the correctly typed Class of the given object.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop Pipes
+ or Hadoop Streaming.
+
+ It also checks to ensure that we are running on a *nix platform else
+ (e.g. in Cygwin/Windows) it returns null.
+ @param conf configuration
+ @return a String[] with the ulimit command arguments or
+ null if we are running on a non *nix platform or
+ if the limit is unspecified.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell interface.
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+ Shell interface.
+ @param env the map of environment key=value
+ @param cmd shell command to execute.
+ @return the output of the executed command.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Shell can be used to run unix commands like du or
+ df. It also offers facilities to gate commands by
+ time-intervals.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ShellCommandExecutor should be used in cases where the output
+ of the command needs no explicit parsing and where the command, working
+ directory and the environment remains unchanged. The output of the command
+ is stored as-is and is expected to be small.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ArrayList of string values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the char to be escaped
+ @return an escaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ charToEscape in the string
+ with the escape char escapeChar
+
+ @param str string
+ @param escapeChar escape char
+ @param charToEscape the escaped char
+ @return an unescaped string]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool, is the standard for any Map-Reduce tool/application.
+ The tool/application should delegate the handling of
+
+ standard command-line options to {@link ToolRunner#run(Tool, String[])}
+ and only handle its custom arguments.
+
+
Here is how a typical Tool is implemented:
+
+ public class MyApp extends Configured implements Tool {
+
+ public int run(String[] args) throws Exception {
+ // Configuration processed by ToolRunner
+ Configuration conf = getConf();
+
+ // Create a JobConf using the processed conf
+ JobConf job = new JobConf(conf, MyApp.class);
+
+ // Process custom command-line options
+ Path in = new Path(args[1]);
+ Path out = new Path(args[2]);
+
+ // Specify various job-specific parameters
+ job.setJobName("my-app");
+ job.setInputPath(in);
+ job.setOutputPath(out);
+ job.setMapperClass(MyApp.MyMapper.class);
+ job.setReducerClass(MyApp.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+ }
+
+ public static void main(String[] args) throws Exception {
+ // Let ToolRunner handle generic command-line options
+ int res = ToolRunner.run(new Configuration(), new Sort(), args);
+
+ System.exit(res);
+ }
+ }
+
+
+ @see GenericOptionsParser
+ @see ToolRunner]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tool by {@link Tool#run(String[])}, after
+ parsing with the given generic arguments. Uses the given
+ Configuration, or builds one if null.
+
+ Sets the Tool's configuration with the possibly modified
+ version of the conf.
+
+ @param conf Configuration for the Tool.
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+ Tool with its Configuration.
+
+ Equivalent to run(tool.getConf(), tool, args).
+
+ @param tool Tool to run.
+ @param args command-line arguments to the tool.
+ @return exit code of the {@link Tool#run(String[])} method.]]>
+
+
+
+
+
+
+
+
+
+ ToolRunner can be used to run classes implementing
+ Tool interface. It works in conjunction with
+ {@link GenericOptionsParser} to parse the
+
+ generic hadoop command line arguments and modifies the
+ Configuration of the Tool. The
+ application-specific options are passed along without being modified.
+
+
+ @see Tool
+ @see GenericOptionsParser]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash function to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Bloom filter, as defined by Bloom in 1970.
+
+ The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by
+ the networking research community in the past decade thanks to the bandwidth efficiencies that it
+ offers for the transmission of set membership information between networked hosts. A sender encodes
+ the information into a bit vector, the Bloom filter, that is more compact than a conventional
+ representation. Computation and space costs for construction are linear in the number of elements.
+ The receiver uses the filter to test whether various elements are members of the set. Though the
+ filter will occasionally return a false positive, it will never return a false negative. When creating
+ the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size.
+
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash function to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).]]>
+
+
+
+
+
+
+
+
+ this counting Bloom filter.
+
+ Invariant: nothing happens if the specified key does not belong to this counter Bloom filter.
+ @param key The key to remove.]]>
+
+
+
+
+
+
+
+
+
+
+
+ key -> count map.
+
NOTE: due to the bucket size of this filter, inserting the same
+ key more than 15 times will cause an overflow at all filter positions
+ associated with this key, and it will significantly increase the error
+ rate for this and other keys. For this reason the filter can only be
+ used to store small count values 0 <= N << 15.
+ @param key key to be tested
+ @return 0 if the key is not present. Otherwise, a positive value v will
+ be returned such that v == count with probability equal to the
+ error rate of this filter, and v > count otherwise.
+ Additionally, if the filter experienced an underflow as a result of
+ {@link #delete(Key)} operation, the return value may be lower than the
+ count with the probability of the false negative rate of such
+ filter.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ counting Bloom filter, as defined by Fan et al. in a ToN
+ 2000 paper.
+
 + A counting Bloom filter is an improvement to a standard Bloom filter as it
+ allows dynamic additions and deletions of set membership information. This
+ is achieved through the use of a counting vector instead of a bit vector.
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Builds an empty Dynamic Bloom filter.
+ @param vectorSize The number of bits in the vector.
+ @param nbHash The number of hash function to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).
+ @param nr The threshold for the maximum number of keys to record in a
+ dynamic Bloom filter row.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ dynamic Bloom filter, as defined in the INFOCOM 2006 paper.
+
+ A dynamic Bloom filter (DBF) makes use of a s * m bit matrix but
+ each of the s rows is a standard Bloom filter. The creation
+ process of a DBF is iterative. At the start, the DBF is a 1 * m
+ bit matrix, i.e., it is composed of a single standard Bloom filter.
+ It assumes that nr elements are recorded in the
+ initial bit vector, where nr <= n (n is
+ the cardinality of the set A to record in the filter).
+
+ As the size of A grows during the execution of the application,
+ several keys must be inserted in the DBF. When inserting a key into the DBF,
+ one must first get an active Bloom filter in the matrix. A Bloom filter is
+ active when the number of recorded keys, nr, is
+ strictly less than the current cardinality of A, n.
+ If an active Bloom filter is found, the key is inserted and
+ nr is incremented by one. On the other hand, if there
+ is no active Bloom filter, a new one is created (i.e., a new row is added to
+ the matrix) according to the current size of A and the element
+ is added in this new Bloom filter and the nr value of
+ this new Bloom filter is set to one. A given key is said to belong to the
+ DBF if the k positions are set to one in one of the matrix rows.
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash functions to consider.
+ @param hashType type of the hashing function (see {@link Hash}).]]>
+
+
+
+
+
+ this filter.
+ @param key The key to add.]]>
+
+
+
+
+
+ this filter.
+ @param key The key to test.
+ @return boolean True if the specified key belongs to this filter.
+ False otherwise.]]>
+
+
+
+
+
+ this filter and a specified filter.
+
+ Invariant: The result is assigned to this filter.
+ @param filter The filter to AND with.]]>
+
+
+
+
+
+ this filter and a specified filter.
+
+ Invariant: The result is assigned to this filter.
+ @param filter The filter to OR with.]]>
+
+
+
+
+
+ this filter and a specified filter.
+
+ Invariant: The result is assigned to this filter.
+ @param filter The filter to XOR with.]]>
+
+
+
+
+ this filter.
+
+ The result is assigned to this filter.]]>
+
+
+
+
+
+ this filter.
+ @param keys The list of keys.]]>
+
+
+
+
+
+ this filter.
+ @param keys The collection of keys.]]>
+
+
+
+
+
+ this filter.
+ @param keys The array of keys.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ A filter is a data structure which aims at offering a lossy summary of a set A. The
+ key idea is to map entries of A (also called keys) into several positions
+ in a vector through the use of several hash functions.
+
+ Typically, a filter will be implemented as a Bloom filter (or a Bloom filter extension).
+
+ It must be extended in order to define the real behavior.
+
+ @see Key The general behavior of a key
+ @see HashFunction A hash function]]>
+
+
+
+
+
+
+
+
+ Builds a hash function that must obey to a given maximum number of returned values and a highest value.
+ @param maxValue The maximum highest returned value.
+ @param nbHash The number of resulting hashed values.
+ @param hashType type of the hashing function (see {@link Hash}).]]>
+
+
+
+
+ this hash function. A NOOP]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Builds a key with a default weight.
+ @param value The byte value of this key.]]>
+
+
+
+
+
+ Builds a key with a specified weight.
+ @param value The value of this key.
+ @param weight The weight associated to this key.]]>
+
+
+
+
+
+
+
+
+
+
+
+ this key.]]>
+
+
+
+
+ this key.]]>
+
+
+
+
+
+ this key with a specified value.
+ @param weight The increment.]]>
+
+
+
+
+ this key by one.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The idea is to randomly select a bit to reset.]]>
+
+
+
+
+
+ The idea is to select the bit to reset that will generate the minimum
+ number of false negative.]]>
+
+
+
+
+
+ The idea is to select the bit to reset that will remove the maximum number
+ of false positive.]]>
+
+
+
+
+
 + The idea is to select the bit to reset that will, at the same time, remove
 + the maximum number of false positives while minimizing the amount of false
 + negatives generated.]]>
+
+
+
+
+ Originally created by
+ European Commission One-Lab Project 034819.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ this filter.
+ @param nbHash The number of hash function to consider.
+ @param hashType type of the hashing function (see
+ {@link org.apache.hadoop.util.hash.Hash}).]]>
+
+
+
+
+
+
+
+
+ this retouched Bloom filter.
+
+ Invariant: if the false positive is null, nothing happens.
+ @param key The false positive key to add.]]>
+
+
+
+
+
+ this retouched Bloom filter.
+ @param coll The collection of false positive.]]>
+
+
+
+
+
+ this retouched Bloom filter.
+ @param keys The list of false positive.]]>
+
+
+
+
+
+ this retouched Bloom filter.
+ @param keys The array of false positive.]]>
+
+
+
+
+
+
+ this retouched Bloom filter.
+ @param scheme The selective clearing scheme to apply.]]>
+
+
+
+
+
+
+
+
+
+
+
+ retouched Bloom filter, as defined in the CoNEXT 2006 paper.
+
+ It allows the removal of selected false positives at the cost of introducing
+ random false negatives, and with the benefit of eliminating some random false
+ positives at the same time.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ length, and
+ the provided seed value
+ @param bytes input bytes
+ @param length length of the valid bytes to consider
+ @param initval seed value
+ @return hash value]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The best hash table sizes are powers of 2. There is no need to do mod
+ a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask.
+ For example, if you need only 10 bits, do
+ h = (h & hashmask(10));
+ In which case, the hash table should have hashsize(10) elements.
+
+
If you are hashing n strings byte[][] k, do it like this:
+ for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h);
+
+
By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
+ code any way you wish, private, educational, or commercial. It's free.
+
+
Use for hash table lookup, or anything where one collision in 2^^32 is
+ acceptable. Do NOT use for cryptographic purposes.]]>
+
+
+
+
+
+
+
+
+
+
+ lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+
+ You can use this free for any purpose. It's in the public domain.
+ It has no warranty.
+
+
+ @see lookup3.c
+ @see Hash Functions (and how this
+ function compares to others such as CRC, MD?, etc
+ @see Has update on the
+ Dr. Dobbs Article]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The C version of MurmurHash 2.0 found at that site was ported
+ to Java by Andrzej Bialecki (ab at getopt org).]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ JobTracker,
+ as {@link JobTracker.State}
+
+ @return the current state of the JobTracker.]]>
+
+
+
+
+ JobTracker
+
+ @return the size of heap memory used by the JobTracker]]>
+
+
+
+
+ JobTracker
+
+ @return the configured size of max heap memory that can be used by the JobTracker]]>
+
+
+
+
+
+
+
+
+
+
+
+ ClusterStatus provides clients with information such as:
+
+
+ Size of the cluster.
+
+
+ Name of the trackers.
+
+
+ Task capacity of the cluster.
+
+
+ The number of currently running map & reduce tasks.
+
+
+ State of the JobTracker.
+
+
+
+
Clients can query for the latest ClusterStatus, via
+ {@link JobClient#getClusterStatus()}.
Counters are bunched into {@link Group}s, each comprising of
+ counters from a particular Enum class.
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Counters} instead.]]>
+
Grouphandles localization of the class name and the
+ counter names.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param fs the file system that the file is on
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobConf, int)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(FileSystem, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.
+ @deprecated Use {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}
+ instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the job output should be compressed,
+ false otherwise]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tasks' Side-Effect Files
+
+
Note: The following is valid only if the {@link OutputCommitter}
+ is {@link FileOutputCommitter}. If OutputCommitter is not
+ a FileOutputCommitter, the task's temporary output
+ directory is same as {@link #getOutputPath(JobConf)} i.e.
 + ${mapred.output.dir}
+
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in ${mapred.work.output.dir} during execution
+ of his reduce-task i.e. via {@link #getWorkOutputPath(JobConf)}, and the
+ framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
Note: the value of ${mapred.work.output.dir} during
+ execution of a particular task-attempt is actually
+ ${mapred.output.dir}/_temporary/_{$taskid}, and this value is
+ set by the map-reduce framework. So, just create any side-files in the
+ path returned by {@link #getWorkOutputPath(JobConf)} from map/reduce
+ task to take advantage of this feature.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The generated name can be used to create custom files from within the
+ different tasks for the job, the names for different tasks will not collide
+ with each other.
+
+
 The given name is postfixed with the task type, 'm' for maps, 'r' for
 + reduces and the task partition number. For example, give a name 'test'
 + running on the first map of the job the generated name will be
 + 'test-m-00000'.
+
+ @param conf the configuration for the job.
+ @param name the name to make unique.
 + @return a unique name across all tasks of the job.]]>
+
+
+
+
+
+
 The path can be used to create custom files from within the map and
 + reduce tasks. The path name will be unique for each task. The path parent
 + will be the job output directory.
+
+
This method uses the {@link #getUniqueName} method to make the file name
+ unique for the task.
+
+ @param conf the configuration for the job.
+ @param name the name for the file.
 + @return a unique path across all tasks of the job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Each {@link InputSplit} is then assigned to an individual {@link Mapper}
+ for processing.
+
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple.
+
+ @param job job configuration.
+ @param numSplits the desired number of splits, a hint.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+ It is the responsibility of the RecordReader to respect
+ record boundaries while processing the logical split to present a
+ record-oriented view to the individual task.
+
+ @param split the {@link InputSplit}
+ @param job the job that this split belongs to
+ @return a {@link RecordReader}]]>
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
 Clearly, logical splits based on input-size is insufficient for many
 + applications since record boundaries are to be respected. In such cases, the
 + application has to also implement a {@link RecordReader} on whom lies the
 + responsibility to respect record-boundaries and present a record-oriented
 + view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see JobClient
+ @see FileInputFormat
+ @deprecated Use {@link org.apache.hadoop.mapreduce.InputFormat} instead.]]>
+
+
+
+
+
+
+
+
+
+ InputSplit.
+
+ @return the number of bytes in the input split.
+ @throws IOException]]>
+
+
+
+
+
+ InputSplit is
+ located as an array of Strings.
+ @throws IOException]]>
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader
+ @deprecated Use {@link org.apache.hadoop.mapreduce.InputSplit} instead.]]>
+
+ Checking the input and output specifications of the job.
+
+
+ Computing the {@link InputSplit}s for the job.
+
+
+ Setup the requisite accounting information for the {@link DistributedCache}
+ of the job, if necessary.
+
+
+ Copying the job's jar and configuration to the map-reduce system directory
+ on the distributed file-system.
+
+
+ Submitting the job to the JobTracker and optionally monitoring
+ it's status.
+
+
+
+ Normally the user creates the application, describes various facets of the
+ job via {@link JobConf} and then uses the JobClient to submit
+ the job and monitor its progress.
+
+
Here is an example on how to use JobClient:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ job.setInputPath(new Path("in"));
+ job.setOutputPath(new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ // Submit the job, then poll for progress until the job is complete
+ JobClient.runJob(job);
+
+
+
Job Control
+
+
At times clients would chain map-reduce jobs to accomplish complex tasks
+ which cannot be done via a single map-reduce job. This is fairly easy since
+ the output of the job, typically, goes to distributed file-system and that
+ can be used as the input for the next job.
+
+
However, this also means that the onus on ensuring jobs are complete
+ (success/failure) lies squarely on the clients. In such situations the
+ various job-control options are:
+
+
+ {@link #runJob(JobConf)} : submits the job and returns only after
+ the job has completed.
+
+
+ {@link #submitJob(JobConf)} : only submits the job, then poll the
+ returned handle to the {@link RunningJob} to query status and make
+ scheduling decisions.
+
+
+ {@link JobConf#setJobEndNotificationURI(String)} : setup a notification
+ on job-completion, thus avoiding polling.
+
+
+
+ @see JobConf
+ @see ClusterStatus
+ @see Tool
+ @see DistributedCache]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If the parameter {@code loadDefaults} is false, the new instance
+ will not load resources from the default files.
+
+ @param loadDefaults specifies whether to load from the default files]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if framework should keep the intermediate files
+ for failed tasks, false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the outputs of the maps are to be compressed,
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This comparator should be provided if the equivalence rules for keys
+ for sorting the intermediates are different from those for grouping keys
+ before each call to
+ {@link Reducer#reduce(Object, java.util.Iterator, OutputCollector, Reporter)}.
+
+
For key-value pairs (K1,V1) and (K2,V2), the values (V1, V2) are passed
+ in a single call to the reduce function if K1 and K2 compare as equal.
+
+
Since {@link #setOutputKeyComparatorClass(Class)} can be used to control
+ how keys are sorted, this can be used in conjunction to simulate
+ secondary sort on values.
+
+
Note: This is not a guarantee of the reduce sort being
+ stable in any sense. (In any case, with the order of available
+ map-outputs to the reduce being non-deterministic, it wouldn't make
+ that much sense.)
+
+ @param theClass the comparator class to be used for grouping keys.
+ It should implement RawComparator.
+ @see #setOutputKeyComparatorClass(Class)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers. Typically the combiner is same as the
+ the {@link Reducer} for the job i.e. {@link #getReducerClass()}.
+
+ @return the user-defined combiner class used to combine map-outputs.]]>
+
+
+
+
+
+ combiner class used to combine map-outputs
+ before being sent to the reducers.
+
+
The combiner is an application-specified aggregation operation, which
+ can help cut down the amount of data transferred between the
+ {@link Mapper} and the {@link Reducer}, leading to better performance.
+
+
The framework may invoke the combiner 0, 1, or multiple times, in both
+ the mapper and reducer tasks. In general, the combiner is called as the
+ sort/merge result is written to disk. The combiner must:
+
+
be side-effect free
+
have the same input and output key types and the same input and
+ output value types
+
+
+
Typically the combiner is same as the Reducer for the
+ job i.e. {@link #setReducerClass(Class)}.
+
+ @param theClass the user-defined combiner class used to combine
+ map-outputs.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on, else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be
+ used for this job for map tasks,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for map tasks,
+ else false.]]>
+
+
+
+
+ true.
+
+ @return true if speculative execution be used
+ for reduce tasks for this job,
+ false otherwise.]]>
+
+
+
+
+
+ true if speculative execution
+ should be turned on for reduce tasks,
+ else false.]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ Note: This is only a hint to the framework. The actual
+ number of spawned map tasks depends on the number of {@link InputSplit}s
+ generated by the job's {@link InputFormat#getSplits(JobConf, int)}.
+
+ A custom {@link InputFormat} is typically used to accurately control
+ the number of map tasks for the job.
+
+
How many maps?
+
+
The number of maps is usually driven by the total size of the inputs
+ i.e. total number of blocks of the input files.
+
+
The right level of parallelism for maps seems to be around 10-100 maps
+ per-node, although it has been set up to 300 or so for very cpu-light map
+ tasks. Task setup takes awhile, so it is best if the maps take at least a
+ minute to execute.
+
+
The default behavior of file-based {@link InputFormat}s is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of input files. However, the {@link FileSystem} blocksize of the
+ input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Thus, if you expect 10TB of input data and have a blocksize of 128MB,
+ you'll end up with 82,000 maps, unless {@link #setNumMapTasks(int)} is
+ used to set it even higher.
+
+ @param n the number of map tasks for this job.
+ @see InputFormat#getSplits(JobConf, int)
+ @see FileInputFormat
+ @see FileSystem#getDefaultBlockSize()
+ @see FileStatus#getBlockSize()]]>
+
+
+
+
+ 1.
+
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+ How many reduces?
+
+
 With 0.95 all of the reduces can launch immediately and
 + start transferring map outputs as the maps finish. With 1.75
+ the faster nodes will finish their first round of reduces and launch a
+ second wave of reduces doing a much better job of load balancing.
+
+
Increasing the number of reduces increases the framework overhead, but
+ increases load balancing and lowers the cost of failures.
+
+
The scaling factors above are slightly less than whole numbers to
+ reserve a few reduce slots in the framework for speculative-tasks, failures
+ etc.
+
+
Reducer NONE
+
+
It is legal to set the number of reduce-tasks to zero.
+
+
In this case the output of the map-tasks directly go to distributed
+ file-system, to the path set by
+ {@link FileOutputFormat#setOutputPath(JobConf, Path)}. Also, the
+ framework doesn't sort the map-outputs before writing it out to HDFS.
+
+ @param n the number of reduce tasks for this job.]]>
+
+
+
+
+ mapred.map.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per map task.]]>
+
+
+
+
+
+
+
+
+
+
+ mapred.reduce.max.attempts
+ property. If this property is not already set, the default is 4 attempts.
+
+ @return the max number of attempts per reduce task.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ noFailures, the
+ tasktracker is blacklisted for this job.
+
+ @param noFailures maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ blacklisted for this job.
+
+ @return the maximum no. of failures of a given job per tasktracker.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed map-task results in
+ the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of map tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+ failed.
+
+ Defaults to zero, i.e. any failed reduce-task results
+ in the job being declared as {@link JobStatus#FAILED}.
+
+ @return the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+ failed.
+
+ @param percent the maximum percentage of reduce tasks that can fail without
+ the job being aborted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed map tasks. The script is
+ given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script needs to be symlinked.
+
+ @param mDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+
+ The debug script can aid debugging of failed reduce tasks. The script
+ is given task's stdout, stderr, syslog, jobconf files as arguments.
+
+
The debug command, run on the node where the map failed, is:
+
+ $script $stdout $stderr $syslog $jobconf.
+
+
+
The script file is distributed through {@link DistributedCache}
+ APIs. The script file needs to be symlinked
+
+ @param rDbgScript the script name]]>
+
+
+
+
+
+
+
+
+
+ null if it hasn't
+ been set.
+ @see #setJobEndNotificationURI(String)]]>
+
+
+
+
+
+ The uri can contain 2 special parameters: $jobId and
+ $jobStatus. Those, if present, are replaced by the job's
+ identifier and completion-status respectively.
+
+
This is typically used by application-writers to implement chaining of
+ Map-Reduce jobs in an asynchronous manner.
+
+ @param uri the job end notification uri
+ @see JobStatus
+ @see Job Completion and Chaining]]>
+
+
+
+
+
+ When a job starts, a shared directory is created at location
+
+ ${mapred.local.dir}/taskTracker/jobcache/$jobid/work/ .
+ This directory is exposed to the users through
+ job.local.dir .
+ So, the tasks can use this space
+ as scratch space and share files among them.
+ This value is available as System property also.
+
+ @return The localized job specific shared directory]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ If a job doesn't specify its virtual memory requirement by setting
+ {@link #MAPRED_TASK_MAXVMEM_PROPERTY} to {@link #DISABLED_MEMORY_LIMIT},
+ tasks are assured a memory limit set to this property. This property is
+ disabled by default, and if not explicitly set to a valid value by the
+ administrators and if a job doesn't specify its virtual memory
+ requirements, the job's tasks will not be assured anything and may be
+ killed by a TT that intends to control the total memory usage of the tasks
+ via memory management functionality.
+
+
+
+ This value should in general be less than the cluster-wide configuration
+ {@link #UPPER_LIMIT_ON_TASK_VMEM_PROPERTY} . If not or if it not set,
+ TaskTracker's memory management may be disabled and a scheduler's memory
+ based scheduling decisions will be affected. Please refer to the
+ documentation of the configured scheduler to see how this property is used.]]>
+
+
+
+
+
+
+ This value will be used by TaskTrackers for monitoring the memory usage of
+ tasks of this jobs. If a TaskTracker's memory management functionality is
+ enabled, each task of this job will be allowed to use a maximum virtual
+ memory specified by this property. If the task's memory usage goes over
+ this value, the task will be failed by the TT. If not set, the cluster-wide
+ configuration {@link #MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY} is used as the
+ default value for memory requirements. If this property cascaded with
+ {@link #MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY} becomes equal to -1, job's
+ tasks will not be assured anything and may be killed by a TT that intends
+ to control the total memory usage of the tasks via memory management
+ functionality. If the memory management functionality is disabled on a TT,
+ this value is ignored.
+
+
+
+ This value should also be not more than the cluster-wide configuration
+ {@link #UPPER_LIMIT_ON_TASK_VMEM_PROPERTY} which has to be set by the site
+ administrators.
+
+
+
+ This value may be used by schedulers that support scheduling based on job's
+ memory requirements. In general, a task of this job will be scheduled on a
+ TaskTracker only if the amount of virtual memory still unoccupied on the
+ TaskTracker is greater than or equal to this value. But different
+ schedulers can take different decisions. Please refer to the documentation
+ of the scheduler being configured to see if it does memory based scheduling
+ and if it does, how this property is used by that scheduler.
+
+ @see #setMaxVirtualMemoryForTask(long)
+ @see #getMaxVirtualMemoryForTask()]]>
+
+
+
+
+
+
+ This value may be used by schedulers that support scheduling based on job's
+ memory requirements. In general, a task of this job will be scheduled on a
+ TaskTracker, only if the amount of physical memory still unoccupied on the
+ TaskTracker is greater than or equal to this value. But different
+ schedulers can take different decisions. Please refer to the documentation
+ of the scheduler being configured to see how it does memory based
+ scheduling and how this variable is used by that scheduler.
+
+ @see #setMaxPhysicalMemoryForTask(long)
+ @see #getMaxPhysicalMemoryForTask()]]>
+
+
+
+
+
+
+ If it is not set on a TaskTracker, TaskTracker's memory management will be
+ disabled.]]>
+
+
+
+ JobConf is the primary interface for a user to describe a
+ map-reduce job to the Hadoop framework for execution. The framework tries to
+ faithfully execute the job as-is described by JobConf, however:
+
+
+ Some configuration parameters might have been marked as
+
+ final by administrators and hence cannot be altered.
+
+
+ While some job parameters are straight-forward to set
+ (e.g. {@link #setNumReduceTasks(int)}), some parameters interact subtly with
+ the rest of the framework and/or job-configuration and are relatively more
+ complex for the user to control finely (e.g. {@link #setNumMapTasks(int)}).
+
+
+
+
JobConf typically specifies the {@link Mapper}, combiner
+ (if any), {@link Partitioner}, {@link Reducer}, {@link InputFormat} and
+ {@link OutputFormat} implementations to be used etc.
+
+
Optionally JobConf is used to specify other advanced facets
+ of the job such as Comparators to be used, files to be put in
+ the {@link DistributedCache}, whether or not intermediate and/or job outputs
+ are to be compressed (and how), debuggability via user-provided scripts
+ ( {@link #setMapDebugScript(String)}/{@link #setReduceDebugScript(String)}),
+ for doing post-processing on task logs, task's stdout, stderr, syslog,
+ etc.
+
+
Here is an example on how to configure a job via JobConf:
+
+ // Create a new JobConf
+ JobConf job = new JobConf(new Configuration(), MyJob.class);
+
+ // Specify various job-specific parameters
+ job.setJobName("myjob");
+
+ FileInputFormat.setInputPaths(job, new Path("in"));
+ FileOutputFormat.setOutputPath(job, new Path("out"));
+
+ job.setMapperClass(MyJob.MyMapper.class);
+ job.setCombinerClass(MyJob.MyReducer.class);
+ job.setReducerClass(MyJob.MyReducer.class);
+
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @return a regex pattern matching JobIDs]]>
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID]]>
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the input key.
+ @param value the input value.
+ @param output collects mapped keys and values.
+ @param reporter facility to report progress.]]>
+
+
+
+ Maps are the individual tasks which transform input records into
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link JobConf} for the
+ job via the {@link JobConfigurable#configure(JobConf)} and initialize
+ themselves. Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
The framework then calls
+ {@link #map(Object, Object, OutputCollector, Reporter)}
+ for each key/value pair in the InputSplit for that task.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the grouping by specifying
+ a Comparator via
+ {@link JobConf#setOutputKeyComparatorClass(Class)}.
+
+
The grouped Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link JobConf#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
The intermediate, grouped outputs are always stored in
+ {@link SequenceFile}s. Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the JobConf.
+
+
If the job has
+ zero
+ reduces then the output of the Mapper is directly written
+ to the {@link FileSystem} without grouping by keys.
+
+
Example:
+
+ public class MyMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String mapTaskId;
+ private String inputFile;
+ private int noRecords = 0;
+
+ public void configure(JobConf job) {
+ mapTaskId = job.get("mapred.task.id");
+ inputFile = job.get("map.input.file");
+ }
+
+ public void map(K key, V val,
+ OutputCollector<K, V> output, Reporter reporter)
+ throws IOException {
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ // reporter.progress();
+
+ // Process some more
+ // ...
+ // ...
+
+ // Increment the no. of <key, value> pairs processed
+ ++noRecords;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 records update application-level status
+ if ((noRecords%100) == 0) {
+ reporter.setStatus(mapTaskId + " processed " + noRecords +
+ " from input-file: " + inputFile);
+ }
+
+ // Output the result
+ output.collect(key, val);
+ }
+ }
+
+
+
Applications may write a custom {@link MapRunnable} to exert greater
+ control on map processing e.g. multi-threaded Mappers etc.
Mapping of input records to output records is complete when this method
+ returns.
+
+ @param input the {@link RecordReader} to read the input records.
+ @param output the {@link OutputCollector} to collect the output records.
+ @param reporter {@link Reporter} to report progress, status-updates etc.
+ @throws IOException]]>
+
+
+
+ Custom implementations of MapRunnable can exert greater
+ control on map processing e.g. multi-threaded, asynchronous mappers etc.
+
+ @see Mapper
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Mapper} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nearly
+ equal content length.
+ Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)}
+ to construct RecordReader's for MultiFileSplit's.
+ @see MultiFileSplit
+ @deprecated Use {@link org.apache.hadoop.mapred.lib.CombineFileInputFormat} instead]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiFileSplit can be used to implement {@link RecordReader}'s, with
+ reading one record per file.
+ @see FileSplit
+ @see MultiFileInputFormat
+ @deprecated Use {@link org.apache.hadoop.mapred.lib.CombineFileSplit} instead]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ <key, value> pairs output by {@link Mapper}s
+ and {@link Reducer}s.
+
+
OutputCollector is the generalization of the facility
+ provided by the Map-Reduce framework to collect data output by either the
+ Mapper or the Reducer i.e. intermediate outputs
+ or the output of the job.
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext
+ @deprecated Use {@link org.apache.hadoop.mapreduce.OutputCommitter} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it is
+ submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param ignored
+ @param job job configuration.
+ @throws IOException when output should not be attempted]]>
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter
+ @see JobConf
+ @deprecated Use {@link org.apache.hadoop.mapreduce.OutputFormat} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Partitioner} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if there exists a key/value,
+ false otherwise.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RawKeyValueIterator is an iterator used to iterate over
+ the raw keys and values during sort/merge of intermediate data.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0.0 to 1.0.
+ @throws IOException]]>
+
+
+
+ RecordReader reads <key, value> pairs from an
+ {@link InputSplit}.
+
+
RecordReader, typically, converts the byte-oriented view of
+ the input, provided by the InputSplit, and presents a
+ record-oriented view for the {@link Mapper} & {@link Reducer} tasks for
+ processing. It thus assumes the responsibility of processing record
+ boundaries and presenting the tasks with keys and values.
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reduces values for a given key.
+
+
The framework calls this method for each
+ <key, (list of values)> pair in the grouped inputs.
+ Output values must be of the same type as input values. Input keys must
+ not be altered. The framework will reuse the key and value objects
+ that are passed into the reduce, therefore the application should clone
+ the objects they want to keep a copy of. In many cases, all values are
+ combined into zero or one value.
+
+
+
Output pairs are collected with calls to
+ {@link OutputCollector#collect(Object,Object)}.
+
+
Applications can use the {@link Reporter} provided to report progress
+ or just indicate that they are alive. In scenarios where the application
+ takes an insignificant amount of time to process individual key/value
+ pairs, this is crucial since the framework might assume that the task has
+ timed-out and kill that task. The other way of avoiding this is to set
+
+ mapred.task.timeout to a high-enough value (or even zero for no
+ time-outs).
+
+ @param key the key.
+ @param values the list of values to reduce.
+ @param output to collect keys and combined values.
+ @param reporter facility to report progress.]]>
+
+
+
+ The number of Reducers for the job is set by the user via
+ {@link JobConf#setNumReduceTasks(int)}. Reducer implementations
+ can access the {@link JobConf} for the job via the
+ {@link JobConfigurable#configure(JobConf)} method and initialize themselves.
+ Similarly they can use the {@link Closeable#close()} method for
+ de-initialization.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
Reducer is input the grouped output of a {@link Mapper}.
+ In this phase the framework, for each Reducer, fetches the
+ relevant partition of the output of all the Mappers, via HTTP.
+
+
+
+
+
Sort
+
+
The framework groups Reducer inputs by keys
+ (since different Mappers may have output the same key) in this
+ stage.
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
If equivalence rules for keys while grouping the intermediates are
+ different from those for grouping keys before reduction, then one may
+ specify a Comparator via
+ {@link JobConf#setOutputValueGroupingComparator(Class)}. Since
+ {@link JobConf#setOutputKeyComparatorClass(Class)} can be used to
+ control how intermediate keys are grouped, these can be used in conjunction
+ to simulate secondary sort on values.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterator, OutputCollector, Reporter)}
+ method is called for each <key, (list of values)> pair in
+ the grouped inputs.
+
The output of the reduce task is typically written to the
+ {@link FileSystem} via
+ {@link OutputCollector#collect(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class MyReducer<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Reducer<K, V, K, V> {
+
+ static enum MyCounters { NUM_RECORDS }
+
+ private String reduceTaskId;
+ private int noKeys = 0;
+
+ public void configure(JobConf job) {
+ reduceTaskId = job.get("mapred.task.id");
+ }
+
+ public void reduce(K key, Iterator<V> values,
+ OutputCollector<K, V> output,
+ Reporter reporter)
+ throws IOException {
+
+ // Process
+ int noValues = 0;
+ while (values.hasNext()) {
+ V value = values.next();
+
+ // Increment the no. of values for this key
+ ++noValues;
+
+ // Process the <key, value> pair (assume this takes a while)
+ // ...
+ // ...
+
+ // Let the framework know that we are alive, and kicking!
+ if ((noValues%10) == 0) {
+ reporter.progress();
+ }
+
+ // Process some more
+ // ...
+ // ...
+
+ // Output the <key, value>
+ output.collect(key, value);
+ }
+
+ // Increment the no. of <key, list of values> pairs processed
+ ++noKeys;
+
+ // Increment counters
+ reporter.incrCounter(NUM_RECORDS, 1);
+
+ // Every 100 keys update application-level status
+ if ((noKeys%100) == 0) {
+ reporter.setStatus(reduceTaskId + " processed " + noKeys);
+ }
+ }
+ }
+
+
+ @see Mapper
+ @see Partitioner
+ @see Reporter
+ @see MapReduceBase
+ @deprecated Use {@link org.apache.hadoop.mapreduce.Reducer} instead.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Counter of the given group/name.]]>
+
+
+
+
+
+
+ Enum.
+ @param amount A non-negative amount by which the counter is to
+ be incremented.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit that the map is reading from.
+ @throws UnsupportedOperationException if called outside a mapper]]>
+
+
+
+
+
+
+
+
+ {@link Mapper} and {@link Reducer} can use the Reporter
+ provided to report progress or just indicate that they are alive. In
+ scenarios where the application takes an insignificant amount of time to
+ process individual key/value pairs, this is crucial since the framework
+ might assume that the task has timed-out and kill that task.
+
+
Applications can also update {@link Counters} via the provided
+ Reporter .
+
+ @see Progressable
+ @see Counters]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's cleanup-tasks, as a float between 0.0
+ and 1.0. When all cleanup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's cleanup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's setup-tasks, as a float between 0.0
+ and 1.0. When all setup tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's setup-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RunningJob is the user-interface to query for details on a
+ running Map-Reduce job.
+
+
Clients can get hold of RunningJob via the {@link JobClient}
+ and then query the running-job for details such as name, configuration,
+ progress etc.
+
+ @see JobClient]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This allows the user to specify the key class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+ This allows the user to specify the value class to be different
+ from the actual class ({@link BytesWritable}) used for writing
+
+ @param conf the {@link JobConf} to modify
+ @param theClass the SequenceFile output key class.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f. The filtering criteria is
+ MD5(key) % f == 0.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ f using
+ the criteria record# % f == 0.
+ For example, if the frequency is 10, one out of 10 records is returned.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_MAP_PROCESSED_RECORDS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ true if auto increment
+ {@link SkipBadRecords#COUNTER_REDUCE_PROCESSED_GROUPS}.
+ false otherwise.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Hadoop provides an optional mode of execution in which the bad records
+ are detected and skipped in further attempts.
+
+
This feature can be used when map/reduce tasks crashes deterministically on
+ certain input. This happens due to bugs in the map/reduce function. The usual
+ course would be to fix these bugs. But sometimes this is not possible;
+ perhaps the bug is in third party libraries for which the source code is
+ not available. Due to this, the task never reaches completion even with
+ multiple attempts and complete data for that task is lost.
+
+
With this feature, only a small portion of data is lost surrounding
+ the bad record, which may be acceptable for some user applications.
+ see {@link SkipBadRecords#setMapperMaxSkipRecords(Configuration, long)}
+
+
+ The skipping mode gets kicked off after a certain number of failures
+ see {@link SkipBadRecords#setAttemptsToStartSkipping(Configuration, int)}
+
+
In the skipping mode, the map/reduce task maintains the record range which
+ is getting processed at all times. Before giving the input to the
+ map/reduce function, it sends this record range to the Task tracker.
+ If task crashes, the Task tracker knows which one was the last reported
+ range. On further attempts that range get skipped.
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all task attempt IDs
+ of any jobtracker, in any job, of the first
+ map task, we would use :
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @param attemptId the task attempt number, or null
+ @return a regex pattern matching TaskAttemptIDs]]>
+
+
+
+
+ An example TaskAttemptID is :
+ attempt_200707121733_0003_m_000005_0 , which represents the
+ zeroth task attempt for the fifth map task in the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+ @param jtIdentifier jobTracker identifier, or null
+ @param jobId job number, or null
+ @param isMap whether the tip is a map, or null
+ @param taskId taskId number, or null
+ @return a regex pattern matching TaskIDs]]>
+
+
+
+
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+
+
+
+
+ (tbl(,),tbl(,),...,tbl(,)) }]]>
+
+
+
+ mapred.join.define.<ident> to a classname. In the expression
+ mapred.join.expr, the identifier will be assumed to be a
+ ComposableRecordReader.
+ mapred.join.keycomparator can be a classname used to compare keys
+ in the join.
+ @see JoinRecordReader
+ @see MultiFilterRecordReader]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ capacity children to position
+ id in the parent reader.
+ The id of a root CompositeRecordReader is -1 by convention, but relying
+ on this is not recommended.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ override(S1,S2,S3) will prefer values
+ from S3 over S2, and values from S2 over S1 for all keys
+ emitted from all sources.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ [,,...,]]]>
+
+
+
+
+
+
+ out.
+ TupleWritable format:
+ {@code
+ ......
+ }]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+
+
+ @param job job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ The Mapper classes are invoked in a chained (or piped) fashion, the output of
+ the first becomes the input of the second, and so on until the last Mapper,
+ the output of the last Mapper will be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed in a chain. This enables having
+ reusable specialized Mappers that can be combined to perform composite
+ operations within a single task.
+
+ Special care has to be taken when creating chains that the key/values output
+ by a Mapper are valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reducer in the chain use matching output and input key and
+ value classes as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. And
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain.
+
+ ChainMapper usage pattern:
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Reducer leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Reducer does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Reducer the configuration given for it,
+ reducerConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ @param job job's JobConf to add the Reducer class.
+ @param klass the Reducer class to add.
+ @param inputKeyClass reducer input key class.
+ @param inputValueClass reducer input value class.
+ @param outputKeyClass reducer output key class.
+ @param outputValueClass reducer output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param reducerConf a JobConf with the configuration for the Reducer
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It has to be specified how key and values are passed from one element of
+ the chain to the next, by value or by reference. If a Mapper leverages the
+ assumed semantics that the key and values are not modified by the collector
+ 'by value' must be used. If the Mapper does not expect this semantics, as
+ an optimization to avoid serialization and deserialization 'by reference'
+ can be used.
+
+ For the added Mapper the configuration given for it,
+ mapperConf, have precedence over the job's JobConf. This
+ precedence is in effect when the task is running.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainMapper, this is done by the addMapper for the last mapper in the chain
+ .
+
+ @param job chain job's JobConf to add the Mapper class.
+ @param klass the Mapper class to add.
+ @param inputKeyClass mapper input key class.
+ @param inputValueClass mapper input value class.
+ @param outputKeyClass mapper output key class.
+ @param outputValueClass mapper output value class.
+ @param byValue indicates if key/values should be passed by value
+ to the next Mapper in the chain, if any.
+ @param mapperConf a JobConf with the configuration for the Mapper
+ class. It is recommended to use a JobConf without default values using the
+ JobConf(boolean loadDefaults) constructor with FALSE.]]>
+
+
+
+
+
+
+ If this method is overridden super.configure(...) should be
+ invoked at the beginning of the overriding method.]]>
+
+
+
+
+
+
+
+
+
+ reduce(...) method of the Reducer with the
+ map(...) methods of the Mappers in the chain.]]>
+
+
+
+
+
+
+ If this method is overridden super.close() should be
+ invoked at the end of the overriding method.]]>
+
+
+
+
+ For each record output by the Reducer, the Mapper classes are invoked in a
+ chained (or piped) fashion, the output of the first becomes the input of the
+ second, and so on until the last Mapper, the output of the last Mapper will
+ be written to the task's output.
+
+ The key functionality of this feature is that the Mappers in the chain do not
+ need to be aware that they are executed after the Reducer or in a chain.
+ This enables having reusable specialized Mappers that can be combined to
+ perform composite operations within a single task.
+
+ Special care has to be taken when creating chains that the key/values output
+ by a Mapper are valid for the following Mapper in the chain. It is assumed
+ all Mappers and the Reduce in the chain use matching output and input key and
+ value classes as no conversion is done by the chaining code.
+
+ Using the ChainMapper and the ChainReducer classes it is possible to compose
+ Map/Reduce jobs that look like [MAP+ / REDUCE MAP*]. An
+ immediate benefit of this pattern is a dramatic reduction in disk IO.
+
+ IMPORTANT: There is no need to specify the output key/value classes for the
+ ChainReducer, this is done by the setReducer or the addMapper for the last
+ element in the chain.
+
+ ChainReducer usage pattern:
+
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RecordReader's for CombineFileSplit's.
+ @see CombineFileSplit]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+ th Path]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ CombineFileSplit can be used to implement {@link org.apache.hadoop.mapred.RecordReader}'s,
+ with reading one record per file.
+ @see org.apache.hadoop.mapred.FileSplit
+ @see CombineFileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ @param freq The frequency with which records will be emitted.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ This will read every split at the client, which is very expensive.
+ @param freq Probability with which a key will be chosen.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ all splits.
+ Takes the first numSamples / numSplits records from each split.
+ @param numSamples Total number of samples to obtain from all selected
+ splits.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true if the name output is multi, false
+ if it is single. If the name output is not defined it returns
+ false]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+
+
+
+
+ @param conf job conf to add the named output
+ @param namedOutput named output name, it has to be a word, letters
+ and numbers only, cannot be the word 'part' as
+ that is reserved for the
+ default output.
+ @param outputFormatClass OutputFormat class.
+ @param keyClass key class
+ @param valueClass value class]]>
+
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters, by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+ @param conf job conf to enable the named output counters in.
+ @param enabled indicates if the counters will be enabled or not.]]>
+
+
+
+
+
+
+ By default these counters are disabled.
+
+ MultipleOutputs supports counters, by default they are disabled.
+ The counters group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+
+ @param conf job conf to check the named output counters in.
+ @return TRUE if the counters are enabled, FALSE if they are disabled.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+
+
+
+
+ @param namedOutput the named output name
+ @param multiName the multi name part
+ @param reporter the reporter
+ @return the output collector for the given named output
+ @throws IOException thrown if output collector could not be created]]>
+
+
+
+
+
+
+ If overridden subclasses must invoke super.close() at the
+ end of their close()
+
+ @throws java.io.IOException thrown if any of the MultipleOutput files
+ could not be closed properly.]]>
+
+
+
+ OutputCollector passed to
+ the map() and reduce() methods of the
+ Mapper and Reducer implementations.
+
+ Each additional output, or named output, may be configured with its own
+ OutputFormat, with its own key class and with its own value
+ class.
+
+ A named output can be a single file or a multi file. The latter is referred to as
+ a multi named output.
+
+ A multi named output is an unbound set of files all sharing the same
+ OutputFormat, key class and value class configuration.
+
+ When named outputs are used within a Mapper implementation,
+ key/values written to a name output are not part of the reduce phase, only
+ key/values written to the job OutputCollector are part of the
+ reduce phase.
+
+ MultipleOutputs supports counters, by default they are disabled. The counters
+ group is the {@link MultipleOutputs} class name.
+
+ The names of the counters are the same as the named outputs. For multi
+ named outputs the name of the counter is the concatenation of the named
+ output, and underscore '_' and the multiname.
+
+ Job configuration usage pattern is:
+
+
+ JobConf conf = new JobConf();
+
+ conf.setInputPath(inDir);
+ FileOutputFormat.setOutputPath(conf, outDir);
+
+ conf.setMapperClass(MOMap.class);
+ conf.setReducerClass(MOReduce.class);
+ ...
+
+ // Defines additional single text based output 'text' for the job
+ MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class,
+ LongWritable.class, Text.class);
+
+ // Defines additional multi sequencefile based output 'sequence' for the
+ // job
+ MultipleOutputs.addMultiNamedOutput(conf, "seq",
+ SequenceFileOutputFormat.class,
+ LongWritable.class, Text.class);
+ ...
+
+ JobClient jc = new JobClient();
+ RunningJob job = jc.submitJob(conf);
+
+ ...
+
+
+ Job configuration usage pattern is:
+
+
+ public class MOReduce implements
+ Reducer<WritableComparable, Writable> {
+ private MultipleOutputs mos;
+
+ public void configure(JobConf conf) {
+ ...
+ mos = new MultipleOutputs(conf);
+ }
+
+ public void reduce(WritableComparable key, Iterator<Writable> values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ ...
+ mos.getCollector("text", reporter).collect(key, new Text("Hello"));
+ mos.getCollector("seq", "A", reporter).collect(key, new Text("Bye"));
+ mos.getCollector("seq", "B", reporter).collect(key, new Text("Chau"));
+ ...
+ }
+
+ public void close() throws IOException {
+ mos.close();
+ ...
+ }
+
+ }
+
]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ It can be used instead of the default implementation,
+ @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU
+ bound in order to improve throughput.
+
+ Map implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured to use this MapRunnable class (using
+ the JobConf.setMapRunnerClass method) and
+ the number of threads the thread-pool can use with the
+ mapred.map.multithreadedrunner.threads property, its default
+ value is 10 threads.
+
+ Alternatively, the properties can be set in the configuration with proper
+ values.
+
+ @see DBConfiguration#configureDB(JobConf, String, String, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String)
+ @see DBInputFormat#setInput(JobConf, Class, String, String, String, String...)
+ @see DBOutputFormat#setOutput(JobConf, String, String...)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 20070101 AND length > 0)'
+ @param orderBy the fieldNames in the orderBy clause.
+ @param fieldNames The field names in the table
+ @see #setInput(JobConf, Class, String, String)]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBInputFormat emits LongWritables containing the record number as
+ key and DBWritables as value.
+
+ The SQL query, and input class can be using one of the two
+ setInput methods.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {@link DBOutputFormat} accepts <key,value> pairs, where
+ key has a type extending DBWritable. Returned {@link RecordWriter}
+ writes only the key to the database with a batch SQL query.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DBWritable. DBWritable, is similar to {@link Writable}
+ except that the {@link #write(PreparedStatement)} method takes a
+ {@link PreparedStatement}, and {@link #readFields(ResultSet)}
+ takes a {@link ResultSet}.
+
+ Implementations are responsible for writing the fields of the object
+ to PreparedStatement, and reading the fields of the object from the
+ ResultSet.
+
+
Example:
+ If we have the following table in the database :
+
+ CREATE TABLE MyTable (
+ counter INTEGER NOT NULL,
+ timestamp BIGINT NOT NULL,
+ );
+
+ then we can read/write the tuples from/to the table with :
+
+ public class MyWritable implements Writable, DBWritable {
+ // Some data
+ private int counter;
+ private long timestamp;
+
+ //Writable#write() implementation
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(counter);
+ out.writeLong(timestamp);
+ }
+
+ //Writable#readFields() implementation
+ public void readFields(DataInput in) throws IOException {
+ counter = in.readInt();
+ timestamp = in.readLong();
+ }
+
+ public void write(PreparedStatement statement) throws SQLException {
+ statement.setInt(1, counter);
+ statement.setLong(2, timestamp);
+ }
+
+ public void readFields(ResultSet resultSet) throws SQLException {
+ counter = resultSet.getInt(1);
+ timestamp = resultSet.getLong(2);
+ }
+ }
+
Note: The split is a logical split of the inputs and the
+ input files are not physically split into chunks. For e.g. a split could
+ be <input-file-path, start, offset> tuple. The InputFormat
+ also creates the {@link RecordReader} to read the {@link InputSplit}.
+
+ @param context job configuration.
+ @return an array of {@link InputSplit}s for the job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ InputFormat describes the input-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the InputFormat of the
+ job to:
+
+
+ Validate the input-specification of the job.
+
+ Split-up the input file(s) into logical {@link InputSplit}s, each of
+ which is then assigned to an individual {@link Mapper}.
+
+
+ Provide the {@link RecordReader} implementation to be used to glean
+ input records from the logical InputSplit for processing by
+ the {@link Mapper}.
+
+
+
+
The default behavior of file-based {@link InputFormat}s, typically
+ sub-classes of {@link FileInputFormat}, is to split the
+ input into logical {@link InputSplit}s based on the total size, in
+ bytes, of the input files. However, the {@link FileSystem} blocksize of
+ the input files is treated as an upper bound for input splits. A lower bound
+ on the split size can be set via
+
+ mapred.min.split.size.
+
+
Clearly, logical splits based on input-size is insufficient for many
+ applications since record boundaries are to be respected. In such cases, the
+ application has to also implement a {@link RecordReader} on whom lies the
+ responsibility to respect record-boundaries and present a record-oriented
+ view of the logical InputSplit to the individual task.
+
+ @see InputSplit
+ @see RecordReader
+ @see FileInputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputSplit represents the data to be processed by an
+ individual {@link Mapper}.
+
+
Typically, it presents a byte-oriented view on the input and is the
+ responsibility of {@link RecordReader} of the job to process this and present
+ a record-oriented view.
+
+ @see InputFormat
+ @see RecordReader]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ InputFormat to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+ OutputFormat to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+ Mapper to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reducer to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+ Partitioner to use
+ @throws IllegalStateException if the job is submitted]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ progress of the job's map-tasks, as a float between 0.0
+ and 1.0. When all map tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's map-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ progress of the job's reduce-tasks, as a float between 0.0
+ and 1.0. When all reduce tasks have completed, the function returns 1.0.
+
+ @return the progress of the job's reduce-tasks.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job is complete, else false.
+ @throws IOException]]>
+
+
+
+
+
+ true if the job succeeded, else false.
+ @throws IOException]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ JobTracker is lost]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1.
+ @return the number of reduce tasks for this job.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An example JobID is :
+ job_200707121733_0003 , which represents the third job
+ running at the jobtracker started at 200707121733.
+
+ Applications should never construct or parse JobID strings, but rather
+ use appropriate constructors or {@link #forName(String)} method.
+
+ @see TaskID
+ @see TaskAttemptID
+ @see org.apache.hadoop.mapred.JobTracker#getNewJobId()
+ @see org.apache.hadoop.mapred.JobTracker#getStartTime()]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the key input type to the Mapper
+ @param the value input type to the Mapper
+ @param the key output type from the Mapper
+ @param the value output type from the Mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Maps are the individual tasks which transform input records into a
+ intermediate records. The transformed intermediate records need not be of
+ the same type as the input records. A given input pair may map to zero or
+ many output pairs.
+
+
The Hadoop Map-Reduce framework spawns one map task for each
+ {@link InputSplit} generated by the {@link InputFormat} for the job.
+ Mapper implementations can access the {@link Configuration} for
+ the job via the {@link JobContext#getConfiguration()}.
+
+
The framework first calls
+ {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}, followed by
+ {@link #map(Object, Object, Context)}
+ for each key/value pair in the InputSplit. Finally
+ {@link #cleanup(Context)} is called.
+
+
All intermediate values associated with a given output key are
+ subsequently grouped by the framework, and passed to a {@link Reducer} to
+ determine the final output. Users can control the sorting and grouping by
+ specifying two key {@link RawComparator} classes.
+
+
The Mapper outputs are partitioned per
+ Reducer. Users can control which keys (and hence records) go to
+ which Reducer by implementing a custom {@link Partitioner}.
+
+
Users can optionally specify a combiner, via
+ {@link Job#setCombinerClass(Class)}, to perform local aggregation of the
+ intermediate outputs, which helps to cut down the amount of data transferred
+ from the Mapper to the Reducer.
+
+
Applications can specify if and how the intermediate
+ outputs are to be compressed and which {@link CompressionCodec}s are to be
+ used via the Configuration.
+
+
If the job has zero
+ reduces then the output of the Mapper is directly written
+ to the {@link OutputFormat} without sorting by keys.
+
+
Example:
+
+ public class TokenCounterMapper
+ extends Mapper
+
+
Applications may override the {@link #run(Context)} method to exert
+ greater control on map processing e.g. multi-threaded Mappers
+ etc.
The Map-Reduce framework relies on the OutputCommitter of
+ the job to:
+
+
+ Setup the job during initialization. For example, create the temporary
+ output directory for the job during the initialization of the job.
+
+
+ Cleanup the job after the job completion. For example, remove the
+ temporary output directory after the job completion.
+
+
+ Setup the task temporary output.
+
+
+ Check whether a task needs a commit. This is to avoid the commit
+ procedure if a task does not need commit.
+
+
+ Commit of the task output.
+
+
+ Discard the task commit.
+
+
+
+ @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
+ @see JobContext
+ @see TaskAttemptContext]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is to validate the output specification for the job when it
+ is submitted. Typically checks that it does not already exist,
+ throwing an exception when it already exists, so that output is not
+ overwritten.
+
+ @param context information about the job
+ @throws IOException when output should not be attempted]]>
+
+
+
+
+
+
+
+
+
+
+
+ OutputFormat describes the output-specification for a
+ Map-Reduce job.
+
+
The Map-Reduce framework relies on the OutputFormat of the
+ job to:
+
+
+ Validate the output-specification of the job. For e.g. check that the
+ output directory doesn't already exist.
+
+ Provide the {@link RecordWriter} implementation to be used to write out
+ the output files of the job. Output files are stored in a
+ {@link FileSystem}.
+
+
+
+ @see RecordWriter]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ Typically a hash function on all or a subset of the key.
+
+ @param key the key to be partitioned.
+ @param value the entry value.
+ @param numPartitions the total number of partitions.
+ @return the partition number for the key.]]>
+
+
+
+ Partitioner controls the partitioning of the keys of the
+ intermediate map-outputs. The key (or a subset of the key) is used to derive
+ the partition, typically by a hash function. The total number of partitions
+ is the same as the number of reduce tasks for the job. Hence this controls
+ which of the m reduce tasks the intermediate key (and hence the
+ record) is sent for reduction.
+
+ @see Reducer]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @param ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ RecordWriter to future operations.
+
+ @param context the context of the task
+ @throws IOException]]>
+
+
+
+ RecordWriter writes the output <key, value> pairs
+ to an output file.
+
+
RecordWriter implementations write the job outputs to the
+ {@link FileSystem}.
+
+ @see OutputFormat]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the class of the input keys
+ @param the class of the input values
+ @param the class of the output keys
+ @param the class of the output values]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Reducer implementations
+ can access the {@link Configuration} for the job via the
+ {@link JobContext#getConfiguration()} method.
+
+
Reducer has 3 primary phases:
+
+
+
+
Shuffle
+
+
The Reducer copies the sorted output from each
+ {@link Mapper} using HTTP across the network.
+
+
+
+
Sort
+
+
The framework merge sorts Reducer inputs by
+ keys
+ (since different Mappers may have output the same key).
+
+
The shuffle and sort phases occur simultaneously i.e. while outputs are
+ being fetched they are merged.
+
+
SecondarySort
+
+
To achieve a secondary sort on the values returned by the value
+ iterator, the application should extend the key with the secondary
+ key and define a grouping comparator. The keys will be sorted using the
+ entire key, but will be grouped using the grouping comparator to decide
+ which keys and values are sent in the same call to reduce.The grouping
+ comparator is specified via
+ {@link Job#setGroupingComparatorClass(Class)}. The sort order is
+ controlled by
+ {@link Job#setSortComparatorClass(Class)}.
+
+
+ For example, say that you want to find duplicate web pages and tag them
+ all with the url of the "best" known example. You would set up the job
+ like:
+
+
Map Input Key: url
+
Map Input Value: document
+
Map Output Key: document checksum, url pagerank
+
Map Output Value: url
+
Partitioner: by checksum
+
OutputKeyComparator: by checksum and then decreasing pagerank
+
OutputValueGroupingComparator: by checksum
+
+
+
+
+
Reduce
+
+
In this phase the
+ {@link #reduce(Object, Iterable, Context)}
+ method is called for each <key, (collection of values)> in
+ the sorted inputs.
+
The output of the reduce task is typically written to a
+ {@link RecordWriter} via
+ {@link Context#write(Object, Object)}.
+
+
+
+
The output of the Reducer is not re-sorted.
+
+
Example:
+
+ public class IntSumReducer extends Reducer {
+ private IntWritable result = new IntWritable();
+
+ public void reduce(Key key, Iterable values,
+ Context context) throws IOException {
+ int sum = 0;
+ for (IntWritable val : values) {
+ sum += val.get();
+ }
+ result.set(sum);
+ context.collect(key, result);
+ }
+ }
+
+ Applications should never construct or parse TaskAttemptID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskID]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ An example TaskID is :
+ task_200707121733_0003_m_000005 , which represents the
+ fifth map task in the third job running at the jobtracker
+ started at 200707121733.
+
+ Applications should never construct or parse TaskID strings
+ , but rather use appropriate constructors or {@link #forName(String)}
+ method.
+
+ @see JobID
+ @see TaskAttemptID]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the input key type for the task
+ @param the input value type for the task
+ @param the output key type for the task
+ @param the output value type for the task]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat implementations can override this and return
+ false to ensure that individual input files are never split-up
+ so that {@link Mapper}s process entire files.
+
+ @param context the job context
+ @param filename the file name to check
+ @return is this file splitable?]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ FileInputFormat is the base class for all file-based
+ InputFormats. This provides a generic implementation of
+ {@link #getSplits(JobContext)}.
+ Subclasses of FileInputFormat can also override the
+ {@link #isSplitable(JobContext, Path)} method to ensure input-files are
+ not split-up and are processed as a whole by {@link Mapper}s.]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ the map's input key type
+ @param the map's input value type
+ @param the map's output key type
+ @param the map's output value type
+ @param job the job
+ @return the mapper class to run]]>
+
+
+
+
+
+
+ the map input key type
+ @param the map input value type
+ @param the map output key type
+ @param the map output value type
+ @param job the job to modify
+ @param cls the class to use as the mapper]]>
+
+
+
+
+
+
+
+
+
+
+
+
+ It can be used instead of the default implementation,
+ @link org.apache.hadoop.mapred.MapRunner, when the Map operation is not CPU
+ bound in order to improve throughput.
+
+ Mapper implementations using this MapRunnable must be thread-safe.
+
+ The Map-Reduce job has to be configured with the mapper to use via
+ {@link #setMapperClass(Configuration, Class)} and
+ the number of threads the thread-pool can use with the
+ {@link #getNumberOfThreads(Configuration) method. The default
+ value is 10 threads.
+
Some applications need to create/write-to side-files, which differ from
+ the actual job-outputs.
+
+
In such cases there could be issues with 2 instances of the same TIP
+ (running simultaneously e.g. speculative tasks) trying to open/write-to the
+ same file (path) on HDFS. Hence the application-writer will have to pick
+ unique names per task-attempt (e.g. using the attemptid, say
+ attempt_200709221812_0001_m_000000_0), not just per TIP.
+
+
To get around this the Map-Reduce framework helps the application-writer
+ out by maintaining a special
+ ${mapred.output.dir}/_temporary/_${taskid}
+ sub-directory for each task-attempt on HDFS where the output of the
+ task-attempt goes. On successful completion of the task-attempt the files
+ in the ${mapred.output.dir}/_temporary/_${taskid} (only)
+ are promoted to ${mapred.output.dir}. Of course, the
+ framework discards the sub-directory of unsuccessful task-attempts. This
+ is completely transparent to the application.
+
+
The application-writer can take advantage of this by creating any
+ side-files required in a work directory during execution
+ of his task i.e. via
+ {@link #getWorkOutputPath(TaskInputOutputContext)}, and
+ the framework will move them out similarly - thus she doesn't have to pick
+ unique paths per task-attempt.
+
+
The entire discussion holds true for maps of jobs with
+ reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
+ goes directly to HDFS.
+
+ @return the {@link Path} to the task's temporary output directory
+ for the map-reduce job.]]>
+
+
+
+
+
+
+
+
+
+ The path can be used to create custom files from within the map and
+ reduce tasks. The path name will be unique for each task. The path parent
+ will be the job output directory.
+
+
This method uses the {@link #getUniqueFile} method to make the file name
+ unique for the task.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/kfs-0.2.2.jar b/lib/kfs-0.2.2.jar
new file mode 100644
index 00000000000..aa32e74baf2
Binary files /dev/null and b/lib/kfs-0.2.2.jar differ
diff --git a/lib/kfs-0.2.LICENSE.txt b/lib/kfs-0.2.LICENSE.txt
new file mode 100644
index 00000000000..d6456956733
--- /dev/null
+++ b/lib/kfs-0.2.LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/test/org/apache/hadoop/cli/TestCLI.java b/src/test/org/apache/hadoop/cli/TestCLI.java
new file mode 100644
index 00000000000..306733cefd2
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/TestCLI.java
@@ -0,0 +1,450 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.cli.util.CLITestData;
+import org.apache.hadoop.cli.util.CommandExecutor;
+import org.apache.hadoop.cli.util.ComparatorBase;
+import org.apache.hadoop.cli.util.ComparatorData;
+import org.apache.hadoop.cli.util.CLITestData.TestCmd;
+import org.apache.hadoop.cli.util.CLITestData.TestCmd.CommandType;
+import org.apache.hadoop.cli.util.CommandExecutor.Result;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.security.authorize.ServiceAuthorizationManager;
+import org.apache.hadoop.util.StringUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Tests for the Command Line Interface (CLI)
+ */
+public class TestCLI extends TestCase {
+ private static final Log LOG =
+ LogFactory.getLog(TestCLI.class.getName());
+
+ // In this mode, it runs the command and compares the actual output
+ // with the expected output
+ public static final String TESTMODE_TEST = "test"; // Run the tests
+
+ // If it is set to nocompare, run the command and do not compare.
+ // This can be useful to populate the testConfig.xml file the first time
+ // a new command is added
+ public static final String TESTMODE_NOCOMPARE = "nocompare";
+ public static final String TEST_CACHE_DATA_DIR =
+ System.getProperty("test.cache.data", "build/test/cache");
+
+ //By default, run the tests. The other mode is to run the commands and not
+ // compare the output
+ protected String testMode = TESTMODE_TEST;
+
+ // Storage for tests read in from the config file
+ protected ArrayList testsFromConfigFile = null;
+ protected ArrayList testComparators = null;
+ protected String thisTestCaseName = null;
+ protected ComparatorData comparatorData = null;
+ protected Configuration conf = null;
+ protected String clitestDataDir = null;
+ protected String username = null;
+
+ /**
+ * Read the test config file - testConf.xml (see {@link #getTestFile()})
+ */
+ protected void readTestConfigFile() {
+ String testConfigFile = getTestFile();
+ if (testsFromConfigFile == null) {
+ boolean success = false;
+ testConfigFile = TEST_CACHE_DATA_DIR + File.separator + testConfigFile;
+ try {
+ SAXParser p = (SAXParserFactory.newInstance()).newSAXParser();
+ p.parse(testConfigFile, new TestConfigFileParser());
+ success = true;
+ } catch (Exception e) {
+ LOG.info("File: " + testConfigFile + " not found");
+ success = false;
+ }
+ assertTrue("Error reading test config file", success);
+ }
+ }
+
+ protected String getTestFile() {
+ return "testConf.xml";
+ }
+
+ /*
+ * Setup
+ */
+ public void setUp() throws Exception {
+ // Read the test config file (default: testConf.xml)
+ readTestConfigFile();
+
+ conf = new Configuration();
+ conf.setBoolean(ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG,
+ true);
+
+ clitestDataDir = new File(TEST_CACHE_DATA_DIR).
+ toURI().toString().replace(' ', '+');
+ }
+
+ /**
+ * Tear down
+ */
+ public void tearDown() throws Exception {
+ displayResults();
+ }
+
+ /**
+ * Expand the commands from the test config xml file
+ * @param cmd
+ * @return String expanded command
+ */
+ protected String expandCommand(final String cmd) {
+ String expCmd = cmd;
+ expCmd = expCmd.replaceAll("CLITEST_DATA", clitestDataDir);
+ expCmd = expCmd.replaceAll("USERNAME", username);
+
+ return expCmd;
+ }
+
+ /**
+ * Display the summarized results
+ */
+ private void displayResults() {
+ LOG.info("Detailed results:");
+ LOG.info("----------------------------------\n");
+
+ for (int i = 0; i < testsFromConfigFile.size(); i++) {
+ CLITestData td = testsFromConfigFile.get(i);
+
+ boolean testResult = td.getTestResult();
+
+ // Display the details only if there is a failure
+ if (!testResult) {
+ LOG.info("-------------------------------------------");
+ LOG.info(" Test ID: [" + (i + 1) + "]");
+ LOG.info(" Test Description: [" + td.getTestDesc() + "]");
+ LOG.info("");
+
+ ArrayList testCommands = td.getTestCommands();
+ for (TestCmd cmd : testCommands) {
+ LOG.info(" Test Commands: [" +
+ expandCommand(cmd.getCmd()) + "]");
+ }
+
+ LOG.info("");
+ ArrayList cleanupCommands = td.getCleanupCommands();
+ for (TestCmd cmd : cleanupCommands) {
+ LOG.info(" Cleanup Commands: [" +
+ expandCommand(cmd.getCmd()) + "]");
+ }
+
+ LOG.info("");
+ ArrayList compdata = td.getComparatorData();
+ for (ComparatorData cd : compdata) {
+ boolean resultBoolean = cd.getTestResult();
+ LOG.info(" Comparator: [" +
+ cd.getComparatorType() + "]");
+ LOG.info(" Comparision result: [" +
+ (resultBoolean ? "pass" : "fail") + "]");
+ LOG.info(" Expected output: [" +
+ cd.getExpectedOutput() + "]");
+ LOG.info(" Actual output: [" +
+ cd.getActualOutput() + "]");
+ }
+ LOG.info("");
+ }
+ }
+
+ LOG.info("Summary results:");
+ LOG.info("----------------------------------\n");
+
+ boolean overallResults = true;
+ int totalPass = 0;
+ int totalFail = 0;
+ int totalComparators = 0;
+ for (int i = 0; i < testsFromConfigFile.size(); i++) {
+ CLITestData td = testsFromConfigFile.get(i);
+ totalComparators +=
+ testsFromConfigFile.get(i).getComparatorData().size();
+ boolean resultBoolean = td.getTestResult();
+ if (resultBoolean) {
+ totalPass ++;
+ } else {
+ totalFail ++;
+ }
+ overallResults &= resultBoolean;
+ }
+
+
+ LOG.info(" Testing mode: " + testMode);
+ LOG.info("");
+ LOG.info(" Overall result: " +
+ (overallResults ? "+++ PASS +++" : "--- FAIL ---"));
+ if ((totalPass + totalFail) == 0) {
+ LOG.info(" # Tests pass: " + 0);
+ LOG.info(" # Tests fail: " + 0);
+ }
+ else
+ {
+ LOG.info(" # Tests pass: " + totalPass +
+ " (" + (100 * totalPass / (totalPass + totalFail)) + "%)");
+ LOG.info(" # Tests fail: " + totalFail +
+ " (" + (100 * totalFail / (totalPass + totalFail)) + "%)");
+ }
+
+ LOG.info(" # Validations done: " + totalComparators +
+ " (each test may do multiple validations)");
+
+ LOG.info("");
+ LOG.info("Failing tests:");
+ LOG.info("--------------");
+ int i = 0;
+ boolean foundTests = false;
+ for (i = 0; i < testsFromConfigFile.size(); i++) {
+ boolean resultBoolean = testsFromConfigFile.get(i).getTestResult();
+ if (!resultBoolean) {
+ LOG.info((i + 1) + ": " +
+ testsFromConfigFile.get(i).getTestDesc());
+ foundTests = true;
+ }
+ }
+ if (!foundTests) {
+ LOG.info("NONE");
+ }
+
+ foundTests = false;
+ LOG.info("");
+ LOG.info("Passing tests:");
+ LOG.info("--------------");
+ for (i = 0; i < testsFromConfigFile.size(); i++) {
+ boolean resultBoolean = testsFromConfigFile.get(i).getTestResult();
+ if (resultBoolean) {
+ LOG.info((i + 1) + ": " +
+ testsFromConfigFile.get(i).getTestDesc());
+ foundTests = true;
+ }
+ }
+ if (!foundTests) {
+ LOG.info("NONE");
+ }
+
+ assertTrue("One of the tests failed. " +
+ "See the Detailed results to identify " +
+ "the command that failed", overallResults);
+
+ }
+
+ /**
+ * Compare the actual output with the expected output
+ * @param compdata
+ * @return
+ */
+ private boolean compareTestOutput(ComparatorData compdata, Result cmdResult) {
+ // Compare the output based on the comparator
+ String comparatorType = compdata.getComparatorType();
+ Class> comparatorClass = null;
+
+ // If testMode is "test", then run the command and compare the output
+ // If testMode is "nocompare", then run the command and dump the output.
+ // Do not compare
+
+ boolean compareOutput = false;
+
+ if (testMode.equals(TESTMODE_TEST)) {
+ try {
+ // Initialize the comparator class and run its compare method
+ comparatorClass = Class.forName("org.apache.hadoop.cli.util." +
+ comparatorType);
+ ComparatorBase comp = (ComparatorBase) comparatorClass.newInstance();
+ compareOutput = comp.compare(cmdResult.getCommandOutput(),
+ compdata.getExpectedOutput());
+ } catch (Exception e) {
+ LOG.info("Error in instantiating the comparator" + e);
+ }
+ }
+
+ return compareOutput;
+ }
+
+ /***********************************
+ ************* TESTS
+ *********************************/
+
+ public void testAll() {
+ LOG.info("TestAll");
+
+ // Run the tests defined in the testConf.xml config file.
+ for (int index = 0; index < testsFromConfigFile.size(); index++) {
+
+ CLITestData testdata = (CLITestData) testsFromConfigFile.get(index);
+
+ // Execute the test commands
+ ArrayList testCommands = testdata.getTestCommands();
+ Result cmdResult = null;
+ for (TestCmd cmd : testCommands) {
+ try {
+ cmdResult = execute(cmd);
+ } catch (Exception e) {
+ fail(StringUtils.stringifyException(e));
+ }
+ }
+
+ boolean overallTCResult = true;
+ // Run comparators
+ ArrayList compdata = testdata.getComparatorData();
+ for (ComparatorData cd : compdata) {
+ final String comptype = cd.getComparatorType();
+
+ boolean compareOutput = false;
+
+ if (! comptype.equalsIgnoreCase("none")) {
+ compareOutput = compareTestOutput(cd, cmdResult);
+ overallTCResult &= compareOutput;
+ }
+
+ cd.setExitCode(cmdResult.getExitCode());
+ cd.setActualOutput(cmdResult.getCommandOutput());
+ cd.setTestResult(compareOutput);
+ }
+ testdata.setTestResult(overallTCResult);
+
+ // Execute the cleanup commands
+ ArrayList cleanupCommands = testdata.getCleanupCommands();
+ for (TestCmd cmd : cleanupCommands) {
+ try {
+ execute(cmd);
+ } catch (Exception e) {
+ fail(StringUtils.stringifyException(e));
+ }
+ }
+ }
+ }
+
+ protected CommandExecutor.Result execute(TestCmd cmd) throws Exception {
+ throw new Exception("Unknow type of Test command:"+ cmd.getType());
+ }
+
+ /*
+ * Parser class for the test config xml file
+ */
+ class TestConfigFileParser extends DefaultHandler {
+ String charString = null;
+ CLITestData td = null;
+ ArrayList testCommands = null;
+ ArrayList cleanupCommands = null;
+
+ @Override
+ public void startDocument() throws SAXException {
+ testsFromConfigFile = new ArrayList();
+ }
+
+ @Override
+ public void startElement(String uri,
+ String localName,
+ String qName,
+ Attributes attributes) throws SAXException {
+ if (qName.equals("test")) {
+ td = new CLITestData();
+ } else if (qName.equals("test-commands")) {
+ testCommands = new ArrayList();
+ } else if (qName.equals("cleanup-commands")) {
+ cleanupCommands = new ArrayList();
+ } else if (qName.equals("comparators")) {
+ testComparators = new ArrayList();
+ } else if (qName.equals("comparator")) {
+ comparatorData = new ComparatorData();
+ }
+ charString = "";
+ }
+
+ @Override
+ public void endElement(String uri,
+ String localName,
+ String qName) throws SAXException {
+ if (qName.equals("description")) {
+ td.setTestDesc(charString);
+ } else if (qName.equals("test-commands")) {
+ td.setTestCommands(testCommands);
+ testCommands = null;
+ } else if (qName.equals("cleanup-commands")) {
+ td.setCleanupCommands(cleanupCommands);
+ cleanupCommands = null;
+ } else if (qName.equals("command")) {
+ if (testCommands != null) {
+ testCommands.add(new TestCmd(charString, CommandType.FS));
+ } else if (cleanupCommands != null) {
+ cleanupCommands.add(new TestCmd(charString, CommandType.FS));
+ }
+ } else if (qName.equals("dfs-admin-command")) {
+ if (testCommands != null) {
+ testCommands.add(new TestCmd(charString, CommandType.DFSADMIN));
+ } else if (cleanupCommands != null) {
+ cleanupCommands.add(new TestCmd(charString, CommandType.DFSADMIN));
+ }
+ } else if (qName.equals("mr-admin-command")) {
+ if (testCommands != null) {
+ testCommands.add(new TestCmd(charString, CommandType.MRADMIN));
+ } else if (cleanupCommands != null) {
+ cleanupCommands.add(new TestCmd(charString, CommandType.MRADMIN));
+ }
+ } else if (qName.equals("archive-command")) {
+ if (testCommands != null) {
+ testCommands.add(new TestCmd(charString, CommandType.ARCHIVE));
+ } else if (cleanupCommands != null) {
+ cleanupCommands.add(new TestCmd(charString, CommandType.ARCHIVE));
+ }
+ } else if (qName.equals("comparators")) {
+ td.setComparatorData(testComparators);
+ } else if (qName.equals("comparator")) {
+ testComparators.add(comparatorData);
+ } else if (qName.equals("type")) {
+ comparatorData.setComparatorType(charString);
+ } else if (qName.equals("expected-output")) {
+ comparatorData.setExpectedOutput(charString);
+ } else if (qName.equals("test")) {
+ testsFromConfigFile.add(td);
+ td = null;
+ } else if (qName.equals("mode")) {
+ testMode = charString;
+ if (!testMode.equals(TESTMODE_NOCOMPARE) &&
+ !testMode.equals(TESTMODE_TEST)) {
+ testMode = TESTMODE_TEST;
+ }
+ }
+ }
+
+ @Override
+ public void characters(char[] ch,
+ int start,
+ int length) throws SAXException {
+ String s = new String(ch, start, length);
+ charString += s;
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/cli/testConf.xml b/src/test/org/apache/hadoop/cli/testConf.xml
new file mode 100644
index 00000000000..3250aa67a18
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/testConf.xml
@@ -0,0 +1,18 @@
+
+
+
+
+
+ test
+
+
+
+
+
+
diff --git a/src/test/org/apache/hadoop/cli/testConf.xsl b/src/test/org/apache/hadoop/cli/testConf.xsl
new file mode 100644
index 00000000000..09fb0b7a500
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/testConf.xsl
@@ -0,0 +1,28 @@
+
+
+
+
+
+
+
+
Hadoop DFS command-line tests
+
+
+
ID
+
Command
+
Description
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/test/org/apache/hadoop/cli/util/CLITestData.java b/src/test/org/apache/hadoop/cli/util/CLITestData.java
new file mode 100644
index 00000000000..18a7133218f
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/CLITestData.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+import java.util.ArrayList;
+
+/**
+ *
+ * Class to store CLI Test Data
+ */
+public class CLITestData {
+ private String testDesc = null;
+ private ArrayList testCommands = null;
+ private ArrayList cleanupCommands = null;
+ private ArrayList comparatorData = null;
+ private boolean testResult = false;
+
+ public CLITestData() {
+
+ }
+
+ /**
+ * Class to define a test command: includes the type of the command and the command itself.
+ * Valid types FS, DFSADMIN, MRADMIN and ARCHIVE.
+ */
+ static public class TestCmd {
+ public enum CommandType {
+ FS,
+ DFSADMIN,
+ MRADMIN,
+ ARCHIVE
+ }
+ private final CommandType type;
+ private final String cmd;
+
+ public TestCmd(String str, CommandType type) {
+ cmd = str;
+ this.type = type;
+ }
+ public CommandType getType() {
+ return type;
+ }
+ public String getCmd() {
+ return cmd;
+ }
+ public String toString() {
+ return cmd;
+ }
+ }
+
+ /**
+ * @return the testDesc
+ */
+ public String getTestDesc() {
+ return testDesc;
+ }
+
+ /**
+ * @param testDesc the testDesc to set
+ */
+ public void setTestDesc(String testDesc) {
+ this.testDesc = testDesc;
+ }
+
+ /**
+ * @return the testCommands
+ */
+ public ArrayList getTestCommands() {
+ return testCommands;
+ }
+
+ /**
+ * @param testCommands the testCommands to set
+ */
+ public void setTestCommands(ArrayList testCommands) {
+ this.testCommands = testCommands;
+ }
+
+ /**
+ * @return the comparatorData
+ */
+ public ArrayList getComparatorData() {
+ return comparatorData;
+ }
+
+ /**
+ * @param comparatorData the comparatorData to set
+ */
+ public void setComparatorData(ArrayList comparatorData) {
+ this.comparatorData = comparatorData;
+ }
+
+ /**
+ * @return the testResult
+ */
+ public boolean getTestResult() {
+ return testResult;
+ }
+
+ /**
+ * @param testResult the testResult to set
+ */
+ public void setTestResult(boolean testResult) {
+ this.testResult = testResult;
+ }
+
+ /**
+ * @return the cleanupCommands
+ */
+ public ArrayList getCleanupCommands() {
+ return cleanupCommands;
+ }
+
+ /**
+ * @param cleanupCommands the cleanupCommands to set
+ */
+ public void setCleanupCommands(ArrayList cleanupCommands) {
+ this.cleanupCommands = cleanupCommands;
+ }
+}
diff --git a/src/test/org/apache/hadoop/cli/util/CommandExecutor.java b/src/test/org/apache/hadoop/cli/util/CommandExecutor.java
new file mode 100644
index 00000000000..7a0dc462a06
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/CommandExecutor.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.cli.TestCLI;
+
+/**
+ *
+ * This class execute commands and captures the output
+ */
+public abstract class CommandExecutor {
+ protected String[] getCommandAsArgs(final String cmd, final String masterKey,
+ final String master) {
+ StringTokenizer tokenizer = new StringTokenizer(cmd, " ");
+ String[] args = new String[tokenizer.countTokens()];
+
+ int i = 0;
+ while (tokenizer.hasMoreTokens()) {
+ args[i] = tokenizer.nextToken();
+
+ args[i] = args[i].replaceAll(masterKey, master);
+ args[i] = args[i].replaceAll("CLITEST_DATA",
+ new File(TestCLI.TEST_CACHE_DATA_DIR).
+ toURI().toString().replace(' ', '+'));
+ args[i] = args[i].replaceAll("USERNAME", System.getProperty("user.name"));
+
+ i++;
+ }
+
+ return args;
+ }
+
+ public Result executeCommand(final String cmd) throws Exception {
+ int exitCode = 0;
+ Exception lastException = null;
+
+
+ ByteArrayOutputStream bao = new ByteArrayOutputStream();
+ PrintStream origOut = System.out;
+ PrintStream origErr = System.err;
+
+ System.setOut(new PrintStream(bao));
+ System.setErr(new PrintStream(bao));
+
+ try {
+ execute(cmd);
+ } catch (Exception e) {
+ e.printStackTrace();
+ lastException = e;
+ exitCode = -1;
+ } finally {
+ System.setOut(origOut);
+ System.setErr(origErr);
+ }
+ return new Result(bao.toString(), exitCode, lastException, cmd);
+ }
+
+ protected abstract void execute(final String cmd) throws Exception;
+
+ public static class Result {
+ final String commandOutput;
+ final int exitCode;
+ final Exception exception;
+ final String cmdExecuted;
+ public Result(String commandOutput, int exitCode, Exception exception,
+ String cmdExecuted) {
+ this.commandOutput = commandOutput;
+ this.exitCode = exitCode;
+ this.exception = exception;
+ this.cmdExecuted = cmdExecuted;
+ }
+
+ public String getCommandOutput() {
+ return commandOutput;
+ }
+
+ public int getExitCode() {
+ return exitCode;
+ }
+
+ public Exception getException() {
+ return exception;
+ }
+
+ public String getCommand() {
+ return cmdExecuted;
+ }
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/cli/util/ComparatorBase.java b/src/test/org/apache/hadoop/cli/util/ComparatorBase.java
new file mode 100644
index 00000000000..fae99377a42
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/ComparatorBase.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+/**
+ *
+ * Comparator interface. To define a new comparator, implement the compare
+ * method
+ */
+public abstract class ComparatorBase {
+ public ComparatorBase() {
+
+ }
+
+ /**
+ * Compare method for the comparator class.
+ * @param actual the actual command output; may be null
+ * @param expected the expected command output; may be null
+ * @return true if the expected output matches the actual output;
+ * false otherwise, including when either argument is null
+ */
+ public abstract boolean compare(String actual, String expected);
+}
diff --git a/src/test/org/apache/hadoop/cli/util/ComparatorData.java b/src/test/org/apache/hadoop/cli/util/ComparatorData.java
new file mode 100644
index 00000000000..1b24777e4c5
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/ComparatorData.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+/**
+ *
+ * Class to store CLI Test Comparators Data
+ */
+public class ComparatorData {
+ private String expectedOutput = null;
+ private String actualOutput = null;
+ private boolean testResult = false;
+ private int exitCode = 0;
+ private String comparatorType = null;
+
+ public ComparatorData() {
+
+ }
+
+ /**
+ * @return the expectedOutput
+ */
+ public String getExpectedOutput() {
+ return expectedOutput;
+ }
+
+ /**
+ * @param expectedOutput the expectedOutput to set
+ */
+ public void setExpectedOutput(String expectedOutput) {
+ this.expectedOutput = expectedOutput;
+ }
+
+ /**
+ * @return the actualOutput
+ */
+ public String getActualOutput() {
+ return actualOutput;
+ }
+
+ /**
+ * @param actualOutput the actualOutput to set
+ */
+ public void setActualOutput(String actualOutput) {
+ this.actualOutput = actualOutput;
+ }
+
+ /**
+ * @return the testResult
+ */
+ public boolean getTestResult() {
+ return testResult;
+ }
+
+ /**
+ * @param testResult the testResult to set
+ */
+ public void setTestResult(boolean testResult) {
+ this.testResult = testResult;
+ }
+
+ /**
+ * @return the exitCode
+ */
+ public int getExitCode() {
+ return exitCode;
+ }
+
+ /**
+ * @param exitCode the exitCode to set
+ */
+ public void setExitCode(int exitCode) {
+ this.exitCode = exitCode;
+ }
+
+ /**
+ * @return the comparatorType
+ */
+ public String getComparatorType() {
+ return comparatorType;
+ }
+
+ /**
+ * @param comparatorType the comparatorType to set
+ */
+ public void setComparatorType(String comparatorType) {
+ this.comparatorType = comparatorType;
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/cli/util/ExactComparator.java b/src/test/org/apache/hadoop/cli/util/ExactComparator.java
new file mode 100644
index 00000000000..9a49a960ce0
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/ExactComparator.java
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+/**
+ * Comparator for the Command line tests.
+ *
+ * This comparator compares the actual to the expected and
+ * returns true only if they are the same
+ *
+ */
+public class ExactComparator extends ComparatorBase {
+
+ @Override
+ public boolean compare(String actual, String expected) {
+ return actual.equals(expected);
+ }
+}
diff --git a/src/test/org/apache/hadoop/cli/util/RegexpAcrossOutputComparator.java b/src/test/org/apache/hadoop/cli/util/RegexpAcrossOutputComparator.java
new file mode 100644
index 00000000000..9285bde9454
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/RegexpAcrossOutputComparator.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+import java.util.regex.Pattern;
+
+/**
+ * Comparator for command line tests that attempts to find a regexp
+ * within the entire text returned by a command.
+ *
+ * This comparator differs from RegexpComparator in that it attempts
+ * to match the pattern within all of the text returned by the command,
+ * rather than matching against each line of the returned text. This
+ * allows matching against patterns that span multiple lines.
+ */
+public class RegexpAcrossOutputComparator extends ComparatorBase {
+
+ @Override
+ public boolean compare(String actual, String expected) {
+ return Pattern.compile(expected).matcher(actual).find();
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/cli/util/RegexpComparator.java b/src/test/org/apache/hadoop/cli/util/RegexpComparator.java
new file mode 100644
index 00000000000..f2477466c12
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/RegexpComparator.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Comparator for the Command line tests.
+ *
+ * This comparator applies the regular expression specified in 'expected'
+ * to each line of the string 'actual' and returns true if any single
+ * line matches the expression completely
+ *
+ */
+public class RegexpComparator extends ComparatorBase {
+
+ @Override
+ public boolean compare(String actual, String expected) {
+ boolean success = false;
+ Pattern p = Pattern.compile(expected);
+
+ StringTokenizer tokenizer = new StringTokenizer(actual, "\n\r");
+ while (tokenizer.hasMoreTokens() && !success) {
+ String actualToken = tokenizer.nextToken();
+ Matcher m = p.matcher(actualToken);
+ success = m.matches();
+ }
+
+ return success;
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/cli/util/SubstringComparator.java b/src/test/org/apache/hadoop/cli/util/SubstringComparator.java
new file mode 100644
index 00000000000..79e9a889fd8
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/SubstringComparator.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+public class SubstringComparator extends ComparatorBase {
+
+ @Override
+ public boolean compare(String actual, String expected) {
+ int compareOutput = actual.indexOf(expected);
+ if (compareOutput == -1) {
+ return false;
+ }
+
+ return true;
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/cli/util/TokenComparator.java b/src/test/org/apache/hadoop/cli/util/TokenComparator.java
new file mode 100644
index 00000000000..ce5b8468c5b
--- /dev/null
+++ b/src/test/org/apache/hadoop/cli/util/TokenComparator.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.cli.util;
+
+import java.util.StringTokenizer;
+
+/**
+ * Comparator for the Command line tests.
+ *
+ * This comparator compares each token in the expected output and returns true
+ * if all tokens are in the actual output
+ *
+ */
+public class TokenComparator extends ComparatorBase {
+
+ @Override
+ public boolean compare(String actual, String expected) {
+ boolean compareOutput = true;
+
+ StringTokenizer tokenizer = new StringTokenizer(expected, ",\n\r");
+
+ while (tokenizer.hasMoreTokens()) {
+ String token = tokenizer.nextToken();
+ if (actual.indexOf(token) != -1) {
+ compareOutput &= true;
+ } else {
+ compareOutput &= false;
+ }
+ }
+
+ return compareOutput;
+ }
+}
diff --git a/src/test/org/apache/hadoop/conf/TestConfiguration.java b/src/test/org/apache/hadoop/conf/TestConfiguration.java
new file mode 100644
index 00000000000..e509fd34641
--- /dev/null
+++ b/src/test/org/apache/hadoop/conf/TestConfiguration.java
@@ -0,0 +1,392 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.conf;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.DataInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.DataOutputStream;
+import java.util.ArrayList;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.fs.Path;
+
+
+public class TestConfiguration extends TestCase {
+
+ private Configuration conf;
+ final static String CONFIG = new File("./test-config.xml").getAbsolutePath();
+ final static String CONFIG2 = new File("./test-config2.xml").getAbsolutePath();
+ final static Random RAN = new Random();
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ conf = new Configuration();
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ super.tearDown();
+ new File(CONFIG).delete();
+ new File(CONFIG2).delete();
+ }
+
+ private void startConfig() throws IOException{
+ out.write("\n");
+ out.write("\n");
+ }
+
+ private void endConfig() throws IOException{
+ out.write("\n");
+ out.close();
+ }
+
+ private void addInclude(String filename) throws IOException{
+ out.write("\n ");
+ }
+
+ public void testVariableSubstitution() throws IOException {
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ declareProperty("my.int", "${intvar}", "42");
+ declareProperty("intvar", "42", "42");
+ declareProperty("my.base", "/tmp/${user.name}", UNSPEC);
+ declareProperty("my.file", "hello", "hello");
+ declareProperty("my.suffix", ".txt", ".txt");
+ declareProperty("my.relfile", "${my.file}${my.suffix}", "hello.txt");
+ declareProperty("my.fullfile", "${my.base}/${my.file}${my.suffix}", UNSPEC);
+ // check that undefined variables are returned as-is
+ declareProperty("my.failsexpand", "a${my.undefvar}b", "a${my.undefvar}b");
+ endConfig();
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+
+ for (Prop p : props) {
+ System.out.println("p=" + p.name);
+ String gotVal = conf.get(p.name);
+ String gotRawVal = conf.getRaw(p.name);
+ assertEq(p.val, gotRawVal);
+ if (p.expectEval == UNSPEC) {
+ // expansion is system-dependent (uses System properties)
+ // can't do exact match so just check that all variables got expanded
+ assertTrue(gotVal != null && -1 == gotVal.indexOf("${"));
+ } else {
+ assertEq(p.expectEval, gotVal);
+ }
+ }
+
+ // check that expansion also occurs for getInt()
+ assertTrue(conf.getInt("intvar", -1) == 42);
+ assertTrue(conf.getInt("my.int", -1) == 42);
+ }
+
+ public static void assertEq(Object a, Object b) {
+ System.out.println("assertEq: " + a + ", " + b);
+ assertEquals(a, b);
+ }
+
+ static class Prop {
+ String name;
+ String val;
+ String expectEval;
+ }
+
+ final String UNSPEC = null;
+ ArrayList props = new ArrayList();
+
+ void declareProperty(String name, String val, String expectEval)
+ throws IOException {
+ declareProperty(name, val, expectEval, false);
+ }
+
+ void declareProperty(String name, String val, String expectEval,
+ boolean isFinal)
+ throws IOException {
+ appendProperty(name, val, isFinal);
+ Prop p = new Prop();
+ p.name = name;
+ p.val = val;
+ p.expectEval = expectEval;
+ props.add(p);
+ }
+
+ void appendProperty(String name, String val) throws IOException {
+ appendProperty(name, val, false);
+ }
+
+ void appendProperty(String name, String val, boolean isFinal)
+ throws IOException {
+ out.write("");
+ out.write("");
+ out.write(name);
+ out.write("");
+ out.write("");
+ out.write(val);
+ out.write("");
+ if (isFinal) {
+ out.write("true");
+ }
+ out.write("\n");
+ }
+
+ public void testOverlay() throws IOException{
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ appendProperty("a","b");
+ appendProperty("b","c");
+ appendProperty("d","e");
+ appendProperty("e","f", true);
+ endConfig();
+
+ out=new BufferedWriter(new FileWriter(CONFIG2));
+ startConfig();
+ appendProperty("a","b");
+ appendProperty("b","d");
+ appendProperty("e","e");
+ endConfig();
+
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+
+ //set dynamically something
+ conf.set("c","d");
+ conf.set("a","d");
+
+ Configuration clone=new Configuration(conf);
+ clone.addResource(new Path(CONFIG2));
+
+ assertEquals(clone.get("a"), "d");
+ assertEquals(clone.get("b"), "d");
+ assertEquals(clone.get("c"), "d");
+ assertEquals(clone.get("d"), "e");
+ assertEquals(clone.get("e"), "f");
+
+ }
+
+ public void testCommentsInValue() throws IOException {
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ appendProperty("my.comment", "this contains a comment");
+ endConfig();
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+ //two spaces one after "this", one before "contains"
+ assertEquals("this contains a comment", conf.get("my.comment"));
+ }
+
+ public void testTrim() throws IOException {
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ String[] whitespaces = {"", " ", "\n", "\t"};
+ String[] name = new String[100];
+ for(int i = 0; i < name.length; i++) {
+ name[i] = "foo" + i;
+ StringBuilder prefix = new StringBuilder();
+ StringBuilder postfix = new StringBuilder();
+ for(int j = 0; j < 3; j++) {
+ prefix.append(whitespaces[RAN.nextInt(whitespaces.length)]);
+ postfix.append(whitespaces[RAN.nextInt(whitespaces.length)]);
+ }
+
+ appendProperty(prefix + name[i] + postfix, name[i] + ".value");
+ }
+ endConfig();
+
+ conf.addResource(new Path(CONFIG));
+ for(String n : name) {
+ assertEquals(n + ".value", conf.get(n));
+ }
+ }
+
+ public void testToString() throws IOException {
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ endConfig();
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+
+ String expectedOutput =
+ "Configuration: core-default.xml, core-site.xml, " +
+ fileResource.toString();
+ assertEquals(expectedOutput, conf.toString());
+ }
+
+ public void testIncludes() throws Exception {
+ tearDown();
+ System.out.println("XXX testIncludes");
+ out=new BufferedWriter(new FileWriter(CONFIG2));
+ startConfig();
+ appendProperty("a","b");
+ appendProperty("c","d");
+ endConfig();
+
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ addInclude(CONFIG2);
+ appendProperty("e","f");
+ appendProperty("g","h");
+ endConfig();
+
+ // verify that the includes file contains all properties
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+ assertEquals(conf.get("a"), "b");
+ assertEquals(conf.get("c"), "d");
+ assertEquals(conf.get("e"), "f");
+ assertEquals(conf.get("g"), "h");
+ tearDown();
+ }
+
+ BufferedWriter out;
+
+ public void testIntegerRanges() {
+ Configuration conf = new Configuration();
+ conf.set("first", "-100");
+ conf.set("second", "4-6,9-10,27");
+ conf.set("third", "34-");
+ Configuration.IntegerRanges range = conf.getRange("first", null);
+ System.out.println("first = " + range);
+ assertEquals(true, range.isIncluded(0));
+ assertEquals(true, range.isIncluded(1));
+ assertEquals(true, range.isIncluded(100));
+ assertEquals(false, range.isIncluded(101));
+ range = conf.getRange("second", null);
+ System.out.println("second = " + range);
+ assertEquals(false, range.isIncluded(3));
+ assertEquals(true, range.isIncluded(4));
+ assertEquals(true, range.isIncluded(6));
+ assertEquals(false, range.isIncluded(7));
+ assertEquals(false, range.isIncluded(8));
+ assertEquals(true, range.isIncluded(9));
+ assertEquals(true, range.isIncluded(10));
+ assertEquals(false, range.isIncluded(11));
+ assertEquals(false, range.isIncluded(26));
+ assertEquals(true, range.isIncluded(27));
+ assertEquals(false, range.isIncluded(28));
+ range = conf.getRange("third", null);
+ System.out.println("third = " + range);
+ assertEquals(false, range.isIncluded(33));
+ assertEquals(true, range.isIncluded(34));
+ assertEquals(true, range.isIncluded(100000000));
+ }
+
+ public void testHexValues() throws IOException{
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ appendProperty("test.hex1", "0x10");
+ appendProperty("test.hex2", "0xF");
+ appendProperty("test.hex3", "-0x10");
+ endConfig();
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+ assertEquals(16, conf.getInt("test.hex1", 0));
+ assertEquals(16, conf.getLong("test.hex1", 0));
+ assertEquals(15, conf.getInt("test.hex2", 0));
+ assertEquals(15, conf.getLong("test.hex2", 0));
+ assertEquals(-16, conf.getInt("test.hex3", 0));
+ assertEquals(-16, conf.getLong("test.hex3", 0));
+
+ }
+
+ public void testIntegerValues() throws IOException{
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ appendProperty("test.int1", "20");
+ appendProperty("test.int2", "020");
+ appendProperty("test.int3", "-20");
+ endConfig();
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+ assertEquals(20, conf.getInt("test.int1", 0));
+ assertEquals(20, conf.getLong("test.int1", 0));
+ assertEquals(20, conf.getInt("test.int2", 0));
+ assertEquals(20, conf.getLong("test.int2", 0));
+ assertEquals(-20, conf.getInt("test.int3", 0));
+ assertEquals(-20, conf.getLong("test.int3", 0));
+ }
+
+ public void testReload() throws IOException {
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ appendProperty("test.key1", "final-value1", true);
+ appendProperty("test.key2", "value2");
+ endConfig();
+ Path fileResource = new Path(CONFIG);
+ conf.addResource(fileResource);
+
+ out=new BufferedWriter(new FileWriter(CONFIG2));
+ startConfig();
+ appendProperty("test.key1", "value1");
+ appendProperty("test.key3", "value3");
+ endConfig();
+ Path fileResource1 = new Path(CONFIG2);
+ conf.addResource(fileResource1);
+
+ // add a few values via set.
+ conf.set("test.key3", "value4");
+ conf.set("test.key4", "value5");
+
+ assertEquals("final-value1", conf.get("test.key1"));
+ assertEquals("value2", conf.get("test.key2"));
+ assertEquals("value4", conf.get("test.key3"));
+ assertEquals("value5", conf.get("test.key4"));
+
+ // change values in the test file...
+ out=new BufferedWriter(new FileWriter(CONFIG));
+ startConfig();
+ appendProperty("test.key1", "final-value1");
+ appendProperty("test.key3", "final-value3", true);
+ endConfig();
+
+ conf.reloadConfiguration();
+ assertEquals("value1", conf.get("test.key1"));
+ // overlayed property overrides.
+ assertEquals("value4", conf.get("test.key3"));
+ assertEquals(null, conf.get("test.key2"));
+ assertEquals("value5", conf.get("test.key4"));
+ }
+
+ public void testSize() throws IOException {
+ Configuration conf = new Configuration(false);
+ conf.set("a", "A");
+ conf.set("b", "B");
+ assertEquals(2, conf.size());
+ }
+
+ public void testClear() throws IOException {
+ Configuration conf = new Configuration(false);
+ conf.set("a", "A");
+ conf.set("b", "B");
+ conf.clear();
+ assertEquals(0, conf.size());
+ assertFalse(conf.iterator().hasNext());
+ }
+
+ public static void main(String[] argv) throws Exception {
+ junit.textui.TestRunner.main(new String[]{
+ TestConfiguration.class.getName()
+ });
+ }
+}
diff --git a/src/test/org/apache/hadoop/conf/TestConfigurationSubclass.java b/src/test/org/apache/hadoop/conf/TestConfigurationSubclass.java
new file mode 100644
index 00000000000..fd2fa38967e
--- /dev/null
+++ b/src/test/org/apache/hadoop/conf/TestConfigurationSubclass.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.conf;
+
+import junit.framework.TestCase;
+
+import java.util.Properties;
+
+/**
+ * Created 21-Jan-2009 13:42:36
+ */
+
+public class TestConfigurationSubclass extends TestCase {
+ private static final String EMPTY_CONFIGURATION_XML
+ = "/org/apache/hadoop/conf/empty-configuration.xml";
+
+
+ public void testGetProps() {
+ SubConf conf = new SubConf(true);
+ Properties properties = conf.getProperties();
+ assertNotNull("hadoop.tmp.dir is not set",
+ properties.getProperty("hadoop.tmp.dir"));
+ }
+
+ public void testReload() throws Throwable {
+ SubConf conf = new SubConf(true);
+ assertFalse(conf.isReloaded());
+ Configuration.addDefaultResource(EMPTY_CONFIGURATION_XML);
+ assertTrue(conf.isReloaded());
+ Properties properties = conf.getProperties();
+ }
+
+ public void testReloadNotQuiet() throws Throwable {
+ SubConf conf = new SubConf(true);
+ conf.setQuietMode(false);
+ assertFalse(conf.isReloaded());
+ conf.addResource("not-a-valid-resource");
+ assertTrue(conf.isReloaded());
+ try {
+ Properties properties = conf.getProperties();
+ fail("Should not have got here");
+ } catch (RuntimeException e) {
+ assertTrue(e.toString(),e.getMessage().contains("not found"));
+ }
+ }
+
+ private static class SubConf extends Configuration {
+
+ private boolean reloaded;
+
+ /**
+ * A new configuration where the behavior of reading from the default resources
+ * can be turned off.
+ *
+ * If the parameter {@code loadDefaults} is false, the new instance will not
+ * load resources from the default files.
+ *
+ * @param loadDefaults specifies whether to load from the default files
+ */
+ private SubConf(boolean loadDefaults) {
+ super(loadDefaults);
+ }
+
+ public Properties getProperties() {
+ return super.getProps();
+ }
+
+ /**
+ * {@inheritDoc}.
+ * Sets the reloaded flag.
+ */
+ @Override
+ public void reloadConfiguration() {
+ super.reloadConfiguration();
+ reloaded = true;
+ }
+
+ public boolean isReloaded() {
+ return reloaded;
+ }
+
+ public void setReloaded(boolean reloaded) {
+ this.reloaded = reloaded;
+ }
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/conf/TestGetInstances.java b/src/test/org/apache/hadoop/conf/TestGetInstances.java
new file mode 100644
index 00000000000..57b7ff45198
--- /dev/null
+++ b/src/test/org/apache/hadoop/conf/TestGetInstances.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.conf;
+
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class TestGetInstances extends TestCase {
+
+ interface SampleInterface {}
+
+ interface ChildInterface extends SampleInterface {}
+
+ static class SampleClass implements SampleInterface {
+ SampleClass() {}
+ }
+
+ static class AnotherClass implements ChildInterface {
+ AnotherClass() {}
+ }
+
+ /**
+ * Makes sure Configuration.getInstances() returns
+ * instances of the required type.
+ */
+ public void testGetInstances() throws Exception {
+ Configuration conf = new Configuration();
+
+ List classes =
+ conf.getInstances("no.such.property", SampleInterface.class);
+ assertTrue(classes.isEmpty());
+
+ conf.set("empty.property", "");
+ classes = conf.getInstances("empty.property", SampleInterface.class);
+ assertTrue(classes.isEmpty());
+
+ conf.setStrings("some.classes",
+ SampleClass.class.getName(), AnotherClass.class.getName());
+ classes = conf.getInstances("some.classes", SampleInterface.class);
+ assertEquals(2, classes.size());
+
+ try {
+ conf.setStrings("some.classes",
+ SampleClass.class.getName(), AnotherClass.class.getName(),
+ String.class.getName());
+ conf.getInstances("some.classes", SampleInterface.class);
+ fail("java.lang.String does not implement SampleInterface");
+ } catch (RuntimeException e) {}
+
+ try {
+ conf.setStrings("some.classes",
+ SampleClass.class.getName(), AnotherClass.class.getName(),
+ "no.such.Class");
+ conf.getInstances("some.classes", SampleInterface.class);
+ fail("no.such.Class does not exist");
+ } catch (RuntimeException e) {}
+ }
+}
diff --git a/src/test/org/apache/hadoop/conf/empty-configuration.xml b/src/test/org/apache/hadoop/conf/empty-configuration.xml
new file mode 100644
index 00000000000..a2086fa683f
--- /dev/null
+++ b/src/test/org/apache/hadoop/conf/empty-configuration.xml
@@ -0,0 +1,4 @@
+
+
+
+
diff --git a/src/test/org/apache/hadoop/filecache/TestDistributedCache.java b/src/test/org/apache/hadoop/filecache/TestDistributedCache.java
new file mode 100644
index 00000000000..2da7f0bc145
--- /dev/null
+++ b/src/test/org/apache/hadoop/filecache/TestDistributedCache.java
@@ -0,0 +1,77 @@
+package org.apache.hadoop.filecache;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import junit.framework.TestCase;
+
+public class TestDistributedCache extends TestCase {
+
+ static final URI LOCAL_FS = URI.create("file:///");
+ private static String TEST_CACHE_BASE_DIR =
+ new Path(System.getProperty("test.build.data","/tmp/cachebasedir"))
+ .toString().replace(' ', '+');
+ private static String TEST_ROOT_DIR =
+ System.getProperty("test.build.data", "/tmp/distributedcache");
+ private static final int TEST_FILE_SIZE = 4 * 1024; // 4K
+ private static final int LOCAL_CACHE_LIMIT = 5 * 1024; //5K
+ private Configuration conf;
+ private Path firstCacheFile;
+ private Path secondCacheFile;
+ private FileSystem localfs;
+
+ /**
+ * @see TestCase#setUp()
+ */
+ @Override
+ protected void setUp() throws IOException {
+ conf = new Configuration();
+ conf.setLong("local.cache.size", LOCAL_CACHE_LIMIT);
+ localfs = FileSystem.get(LOCAL_FS, conf);
+ firstCacheFile = new Path(TEST_ROOT_DIR+"/firstcachefile");
+ secondCacheFile = new Path(TEST_ROOT_DIR+"/secondcachefile");
+ createTempFile(localfs, firstCacheFile);
+ createTempFile(localfs, secondCacheFile);
+ }
+
+ /** test delete cache */
+ public void testDeleteCache() throws Exception {
+ DistributedCache.getLocalCache(firstCacheFile.toUri(), conf, new Path(TEST_CACHE_BASE_DIR),
+ false, System.currentTimeMillis(), new Path(TEST_ROOT_DIR));
+ DistributedCache.releaseCache(firstCacheFile.toUri(), conf);
+ //The code above localized a file of size 4K and then released the cache, which causes the
+ //cache to be deleted once the size limit is exceeded. The code below localizes another cache,
+ //which is designed to sweep away the first cache.
+ DistributedCache.getLocalCache(secondCacheFile.toUri(), conf, new Path(TEST_CACHE_BASE_DIR),
+ false, System.currentTimeMillis(), new Path(TEST_ROOT_DIR));
+ FileStatus[] dirStatuses = localfs.listStatus(new Path(TEST_CACHE_BASE_DIR));
+ assertTrue("DistributedCache failed deleting old cache when the cache store is full.",
+ dirStatuses.length > 1);
+ }
+
+ private void createTempFile(FileSystem fs, Path p) throws IOException {
+ FSDataOutputStream out = fs.create(p);
+ byte[] toWrite = new byte[TEST_FILE_SIZE];
+ new Random().nextBytes(toWrite);
+ out.write(toWrite);
+ out.close();
+ FileSystem.LOG.info("created: " + p + ", size=" + TEST_FILE_SIZE);
+ }
+
+ /**
+ * @see TestCase#tearDown()
+ */
+ @Override
+ protected void tearDown() throws IOException {
+ localfs.delete(firstCacheFile, true);
+ localfs.delete(secondCacheFile, true);
+ localfs.close();
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/FileSystemContractBaseTest.java b/src/test/org/apache/hadoop/fs/FileSystemContractBaseTest.java
new file mode 100644
index 00000000000..8bdeb3bfd7d
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/FileSystemContractBaseTest.java
@@ -0,0 +1,471 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ *
+ * A collection of tests for the contract of the {@link FileSystem}.
+ * This test should be used for general-purpose implementations of
+ * {@link FileSystem}, that is, implementations that support all
+ * of the functionality of {@link FileSystem}.
+ *
+ *
+ * To test a given {@link FileSystem} implementation create a subclass of this
+ * test and override {@link #setUp()} to initialize the fs
+ * {@link FileSystem} instance variable.
+ *
+ */
+public abstract class FileSystemContractBaseTest extends TestCase {
+
+ protected FileSystem fs;
+ private byte[] data = new byte[getBlockSize() * 2]; // two blocks of data
+ {
+ for (int i = 0; i < data.length; i++) {
+ data[i] = (byte) (i % 10);
+ }
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ fs.delete(path("/test"), true);
+ }
+
+ protected int getBlockSize() {
+ return 1024;
+ }
+
+ protected String getDefaultWorkingDirectory() {
+ return "/user/" + System.getProperty("user.name");
+ }
+
+ protected boolean renameSupported() {
+ return true;
+ }
+
+ public void testFsStatus() throws Exception {
+ FsStatus fsStatus = fs.getStatus();
+ assertNotNull(fsStatus);
+ //used, free and capacity are non-negative longs
+ assertTrue(fsStatus.getUsed() >= 0);
+ assertTrue(fsStatus.getRemaining() >= 0);
+ assertTrue(fsStatus.getCapacity() >= 0);
+ }
+
+ public void testWorkingDirectory() throws Exception {
+
+ Path workDir = path(getDefaultWorkingDirectory());
+ assertEquals(workDir, fs.getWorkingDirectory());
+
+ fs.setWorkingDirectory(path("."));
+ assertEquals(workDir, fs.getWorkingDirectory());
+
+ fs.setWorkingDirectory(path(".."));
+ assertEquals(workDir.getParent(), fs.getWorkingDirectory());
+
+ Path relativeDir = path("hadoop");
+ fs.setWorkingDirectory(relativeDir);
+ assertEquals(relativeDir, fs.getWorkingDirectory());
+
+ Path absoluteDir = path("/test/hadoop");
+ fs.setWorkingDirectory(absoluteDir);
+ assertEquals(absoluteDir, fs.getWorkingDirectory());
+
+ }
+
+ public void testMkdirs() throws Exception {
+ Path testDir = path("/test/hadoop");
+ assertFalse(fs.exists(testDir));
+ assertFalse(fs.isFile(testDir));
+
+ assertTrue(fs.mkdirs(testDir));
+
+ assertTrue(fs.exists(testDir));
+ assertFalse(fs.isFile(testDir));
+
+ assertTrue(fs.mkdirs(testDir));
+
+ assertTrue(fs.exists(testDir));
+ assertFalse(fs.isFile(testDir));
+
+ Path parentDir = testDir.getParent();
+ assertTrue(fs.exists(parentDir));
+ assertFalse(fs.isFile(parentDir));
+
+ Path grandparentDir = parentDir.getParent();
+ assertTrue(fs.exists(grandparentDir));
+ assertFalse(fs.isFile(grandparentDir));
+
+ }
+
+ public void testMkdirsFailsForSubdirectoryOfExistingFile() throws Exception {
+ Path testDir = path("/test/hadoop");
+ assertFalse(fs.exists(testDir));
+ assertTrue(fs.mkdirs(testDir));
+ assertTrue(fs.exists(testDir));
+
+ createFile(path("/test/hadoop/file"));
+
+ Path testSubDir = path("/test/hadoop/file/subdir");
+ try {
+ fs.mkdirs(testSubDir);
+ fail("Should throw IOException.");
+ } catch (IOException e) {
+ // expected
+ }
+ assertFalse(fs.exists(testSubDir));
+
+ Path testDeepSubDir = path("/test/hadoop/file/deep/sub/dir");
+ try {
+ fs.mkdirs(testDeepSubDir);
+ fail("Should throw IOException.");
+ } catch (IOException e) {
+ // expected
+ }
+ assertFalse(fs.exists(testDeepSubDir));
+
+ }
+
+ public void testGetFileStatusThrowsExceptionForNonExistentFile()
+ throws Exception {
+ try {
+ fs.getFileStatus(path("/test/hadoop/file"));
+ fail("Should throw FileNotFoundException");
+ } catch (FileNotFoundException e) {
+ // expected
+ }
+ }
+
+ public void testListStatusReturnsNullForNonExistentFile() throws Exception {
+ assertNull(fs.listStatus(path("/test/hadoop/file")));
+ }
+
+ public void testListStatus() throws Exception {
+ Path[] testDirs = { path("/test/hadoop/a"),
+ path("/test/hadoop/b"),
+ path("/test/hadoop/c/1"), };
+ assertFalse(fs.exists(testDirs[0]));
+
+ for (Path path : testDirs) {
+ assertTrue(fs.mkdirs(path));
+ }
+
+ FileStatus[] paths = fs.listStatus(path("/test"));
+ assertEquals(1, paths.length);
+ assertEquals(path("/test/hadoop"), paths[0].getPath());
+
+ paths = fs.listStatus(path("/test/hadoop"));
+ assertEquals(3, paths.length);
+ assertEquals(path("/test/hadoop/a"), paths[0].getPath());
+ assertEquals(path("/test/hadoop/b"), paths[1].getPath());
+ assertEquals(path("/test/hadoop/c"), paths[2].getPath());
+
+ paths = fs.listStatus(path("/test/hadoop/a"));
+ assertEquals(0, paths.length);
+ }
+
+ public void testWriteReadAndDeleteEmptyFile() throws Exception {
+ writeReadAndDelete(0);
+ }
+
+ public void testWriteReadAndDeleteHalfABlock() throws Exception {
+ writeReadAndDelete(getBlockSize() / 2);
+ }
+
+ public void testWriteReadAndDeleteOneBlock() throws Exception {
+ writeReadAndDelete(getBlockSize());
+ }
+
+ public void testWriteReadAndDeleteOneAndAHalfBlocks() throws Exception {
+ writeReadAndDelete(getBlockSize() + (getBlockSize() / 2));
+ }
+
+ public void testWriteReadAndDeleteTwoBlocks() throws Exception {
+ writeReadAndDelete(getBlockSize() * 2);
+ }
+
+ private void writeReadAndDelete(int len) throws IOException {
+ Path path = path("/test/hadoop/file");
+
+ fs.mkdirs(path.getParent());
+
+ FSDataOutputStream out = fs.create(path, false,
+ fs.getConf().getInt("io.file.buffer.size", 4096),
+ (short) 1, getBlockSize());
+ out.write(data, 0, len);
+ out.close();
+
+ assertTrue("Exists", fs.exists(path));
+ assertEquals("Length", len, fs.getFileStatus(path).getLen());
+
+ FSDataInputStream in = fs.open(path);
+ byte[] buf = new byte[len];
+ in.readFully(0, buf);
+ in.close();
+
+ assertEquals(len, buf.length);
+ for (int i = 0; i < buf.length; i++) {
+ assertEquals("Position " + i, data[i], buf[i]);
+ }
+
+ assertTrue("Deleted", fs.delete(path, false));
+
+ assertFalse("No longer exists", fs.exists(path));
+
+ }
+
+ public void testOverwrite() throws IOException {
+ Path path = path("/test/hadoop/file");
+
+ fs.mkdirs(path.getParent());
+
+ createFile(path);
+
+ assertTrue("Exists", fs.exists(path));
+ assertEquals("Length", data.length, fs.getFileStatus(path).getLen());
+
+ try {
+ fs.create(path, false);
+ fail("Should throw IOException.");
+ } catch (IOException e) {
+ // Expected
+ }
+
+ FSDataOutputStream out = fs.create(path, true);
+ out.write(data, 0, data.length);
+ out.close();
+
+ assertTrue("Exists", fs.exists(path));
+ assertEquals("Length", data.length, fs.getFileStatus(path).getLen());
+
+ }
+
+ public void testWriteInNonExistentDirectory() throws IOException {
+ Path path = path("/test/hadoop/file");
+ assertFalse("Parent doesn't exist", fs.exists(path.getParent()));
+ createFile(path);
+
+ assertTrue("Exists", fs.exists(path));
+ assertEquals("Length", data.length, fs.getFileStatus(path).getLen());
+ assertTrue("Parent exists", fs.exists(path.getParent()));
+ }
+
+ public void testDeleteNonExistentFile() throws IOException {
+ Path path = path("/test/hadoop/file");
+ assertFalse("Doesn't exist", fs.exists(path));
+ assertFalse("No deletion", fs.delete(path, true));
+ }
+
+ public void testDeleteRecursively() throws IOException {
+ Path dir = path("/test/hadoop");
+ Path file = path("/test/hadoop/file");
+ Path subdir = path("/test/hadoop/subdir");
+
+ createFile(file);
+ assertTrue("Created subdir", fs.mkdirs(subdir));
+
+ assertTrue("File exists", fs.exists(file));
+ assertTrue("Dir exists", fs.exists(dir));
+ assertTrue("Subdir exists", fs.exists(subdir));
+
+ try {
+ fs.delete(dir, false);
+ fail("Should throw IOException.");
+ } catch (IOException e) {
+ // expected
+ }
+ assertTrue("File still exists", fs.exists(file));
+ assertTrue("Dir still exists", fs.exists(dir));
+ assertTrue("Subdir still exists", fs.exists(subdir));
+
+ assertTrue("Deleted", fs.delete(dir, true));
+ assertFalse("File doesn't exist", fs.exists(file));
+ assertFalse("Dir doesn't exist", fs.exists(dir));
+ assertFalse("Subdir doesn't exist", fs.exists(subdir));
+ }
+
+ public void testDeleteEmptyDirectory() throws IOException {
+ Path dir = path("/test/hadoop");
+ assertTrue(fs.mkdirs(dir));
+ assertTrue("Dir exists", fs.exists(dir));
+ assertTrue("Deleted", fs.delete(dir, false));
+ assertFalse("Dir doesn't exist", fs.exists(dir));
+ }
+
+ public void testRenameNonExistentPath() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/path");
+ Path dst = path("/test/new/newpath");
+ rename(src, dst, false, false, false);
+ }
+
+ public void testRenameFileMoveToNonExistentDirectory() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/file");
+ createFile(src);
+ Path dst = path("/test/new/newfile");
+ rename(src, dst, false, true, false);
+ }
+
+ public void testRenameFileMoveToExistingDirectory() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/file");
+ createFile(src);
+ Path dst = path("/test/new/newfile");
+ fs.mkdirs(dst.getParent());
+ rename(src, dst, true, false, true);
+ }
+
+ public void testRenameFileAsExistingFile() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/file");
+ createFile(src);
+ Path dst = path("/test/new/newfile");
+ createFile(dst);
+ rename(src, dst, false, true, true);
+ }
+
+ public void testRenameFileAsExistingDirectory() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/file");
+ createFile(src);
+ Path dst = path("/test/new/newdir");
+ fs.mkdirs(dst);
+ rename(src, dst, true, false, true);
+ assertTrue("Destination changed",
+ fs.exists(path("/test/new/newdir/file")));
+ }
+
+ public void testRenameDirectoryMoveToNonExistentDirectory()
+ throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/dir");
+ fs.mkdirs(src);
+ Path dst = path("/test/new/newdir");
+ rename(src, dst, false, true, false);
+ }
+
+ public void testRenameDirectoryMoveToExistingDirectory() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/dir");
+ fs.mkdirs(src);
+ createFile(path("/test/hadoop/dir/file1"));
+ createFile(path("/test/hadoop/dir/subdir/file2"));
+
+ Path dst = path("/test/new/newdir");
+ fs.mkdirs(dst.getParent());
+ rename(src, dst, true, false, true);
+
+ assertFalse("Nested file1 exists",
+ fs.exists(path("/test/hadoop/dir/file1")));
+ assertFalse("Nested file2 exists",
+ fs.exists(path("/test/hadoop/dir/subdir/file2")));
+ assertTrue("Renamed nested file1 exists",
+ fs.exists(path("/test/new/newdir/file1")));
+ assertTrue("Renamed nested exists",
+ fs.exists(path("/test/new/newdir/subdir/file2")));
+ }
+
+ public void testRenameDirectoryAsExistingFile() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/dir");
+ fs.mkdirs(src);
+ Path dst = path("/test/new/newfile");
+ createFile(dst);
+ rename(src, dst, false, true, true);
+ }
+
+ public void testRenameDirectoryAsExistingDirectory() throws Exception {
+ if (!renameSupported()) return;
+
+ Path src = path("/test/hadoop/dir");
+ fs.mkdirs(src);
+ createFile(path("/test/hadoop/dir/file1"));
+ createFile(path("/test/hadoop/dir/subdir/file2"));
+
+ Path dst = path("/test/new/newdir");
+ fs.mkdirs(dst);
+ rename(src, dst, true, false, true);
+ assertTrue("Destination changed",
+ fs.exists(path("/test/new/newdir/dir")));
+ assertFalse("Nested file1 exists",
+ fs.exists(path("/test/hadoop/dir/file1")));
+ assertFalse("Nested file2 exists",
+ fs.exists(path("/test/hadoop/dir/subdir/file2")));
+ assertTrue("Renamed nested file1 exists",
+ fs.exists(path("/test/new/newdir/dir/file1")));
+ assertTrue("Renamed nested exists",
+ fs.exists(path("/test/new/newdir/dir/subdir/file2")));
+ }
+
+ public void testInputStreamClosedTwice() throws IOException {
+ //HADOOP-4760 according to Closeable#close() closing already-closed
+ //streams should have no effect.
+ Path src = path("/test/hadoop/file");
+ createFile(src);
+ FSDataInputStream in = fs.open(src);
+ in.close();
+ in.close();
+ }
+
+ public void testOutputStreamClosedTwice() throws IOException {
+ //HADOOP-4760 according to Closeable#close() closing already-closed
+ //streams should have no effect.
+ Path src = path("/test/hadoop/file");
+ FSDataOutputStream out = fs.create(src);
+ out.writeChar('H'); //write some data
+ out.close();
+ out.close();
+ }
+
+ protected Path path(String pathString) {
+ return new Path(pathString).makeQualified(fs);
+ }
+
+ protected void createFile(Path path) throws IOException {
+ FSDataOutputStream out = fs.create(path);
+ out.write(data, 0, data.length);
+ out.close();
+ }
+
+ private void rename(Path src, Path dst, boolean renameSucceeded,
+ boolean srcExists, boolean dstExists) throws IOException {
+ assertEquals("Rename result", renameSucceeded, fs.rename(src, dst));
+ assertEquals("Source exists", srcExists, fs.exists(src));
+ assertEquals("Destination exists", dstExists, fs.exists(dst));
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestChecksumFileSystem.java b/src/test/org/apache/hadoop/fs/TestChecksumFileSystem.java
new file mode 100644
index 00000000000..c55fc3ae414
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestChecksumFileSystem.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.net.URI;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.conf.Configuration;
+import junit.framework.TestCase;
+
+public class TestChecksumFileSystem extends TestCase {
+ public void testgetChecksumLength() throws Exception {
+ assertEquals(8, ChecksumFileSystem.getChecksumLength(0L, 512));
+ assertEquals(12, ChecksumFileSystem.getChecksumLength(1L, 512));
+ assertEquals(12, ChecksumFileSystem.getChecksumLength(512L, 512));
+ assertEquals(16, ChecksumFileSystem.getChecksumLength(513L, 512));
+ assertEquals(16, ChecksumFileSystem.getChecksumLength(1023L, 512));
+ assertEquals(16, ChecksumFileSystem.getChecksumLength(1024L, 512));
+ assertEquals(408, ChecksumFileSystem.getChecksumLength(100L, 1));
+ assertEquals(4000000000008L,
+ ChecksumFileSystem.getChecksumLength(10000000000000L, 10));
+ }
+
+ public void testVerifyChecksum() throws Exception {
+ String TEST_ROOT_DIR
+ = System.getProperty("test.build.data","build/test/data/work-dir/localfs");
+
+ Configuration conf = new Configuration();
+ LocalFileSystem localFs = FileSystem.getLocal(conf);
+ Path testPath = new Path(TEST_ROOT_DIR, "testPath");
+ Path testPath11 = new Path(TEST_ROOT_DIR, "testPath11");
+ FSDataOutputStream fout = localFs.create(testPath);
+ fout.write("testing".getBytes());
+ fout.close();
+
+ fout = localFs.create(testPath11);
+ fout.write("testing you".getBytes());
+ fout.close();
+
+ localFs.delete(localFs.getChecksumFile(testPath), true);
+ assertTrue("checksum deleted", !localFs.exists(localFs.getChecksumFile(testPath)));
+
+ //copying the wrong checksum file
+ FileUtil.copy(localFs, localFs.getChecksumFile(testPath11), localFs,
+ localFs.getChecksumFile(testPath),false,true,conf);
+ assertTrue("checksum exists", localFs.exists(localFs.getChecksumFile(testPath)));
+
+ boolean errorRead = false;
+ try {
+ TestLocalFileSystem.readFile(localFs, testPath);
+ }catch(ChecksumException ie) {
+ errorRead = true;
+ }
+ assertTrue("error reading", errorRead);
+
+ //now setting verify false, the read should succeed
+ localFs.setVerifyChecksum(false);
+ String str = TestLocalFileSystem.readFile(localFs, testPath);
+ assertTrue("read", "testing".equals(str));
+
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestDFVariations.java b/src/test/org/apache/hadoop/fs/TestDFVariations.java
new file mode 100644
index 00000000000..3999050069b
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestDFVariations.java
@@ -0,0 +1,63 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.hadoop.fs;
+
+import junit.framework.TestCase;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.EnumSet;
+
+public class TestDFVariations extends TestCase {
+
+ public static class XXDF extends DF {
+ private final String osName;
+ public XXDF(String osName) throws IOException {
+ super(new File(System.getProperty("test.build.data","/tmp")), 0L);
+ this.osName = osName;
+ }
+ @Override
+ public DF.OSType getOSType() {
+ return DF.getOSType(osName);
+ }
+ @Override
+ protected String[] getExecString() {
+ switch(getOSType()) {
+ case OS_TYPE_AIX:
+ return new String[] { "echo", "IGNORE\n", "/dev/sda3",
+ "453115160", "400077240", "11%", "18", "skip%", "/foo/bar", "\n" };
+ default:
+ return new String[] { "echo", "IGNORE\n", "/dev/sda3",
+ "453115160", "53037920", "400077240", "11%", "/foo/bar", "\n" };
+ }
+ }
+ }
+
+ public void testOSParsing() throws Exception {
+ for (DF.OSType ost : EnumSet.allOf(DF.OSType.class)) {
+ XXDF df = new XXDF(ost.getId());
+ assertEquals(ost.getId() + " total", 453115160 * 1024L, df.getCapacity());
+ assertEquals(ost.getId() + " used", 53037920 * 1024L, df.getUsed());
+ assertEquals(ost.getId() + " avail", 400077240 * 1024L, df.getAvailable());
+ assertEquals(ost.getId() + " pcnt used", 11, df.getPercentUsed());
+ assertEquals(ost.getId() + " mount", "/foo/bar", df.getMount());
+ }
+ }
+
+}
+
diff --git a/src/test/org/apache/hadoop/fs/TestDU.java b/src/test/org/apache/hadoop/fs/TestDU.java
new file mode 100644
index 00000000000..6df487be55f
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestDU.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import junit.framework.TestCase;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.Random;
+
+/** This test makes sure that "DU" does not get to run on each call to getUsed */
+public class TestDU extends TestCase {
+ final static private File DU_DIR = new File(
+ System.getProperty("test.build.data","/tmp"), "dutmp");
+
+ public void setUp() throws IOException {
+ FileUtil.fullyDelete(DU_DIR);
+ assertTrue(DU_DIR.mkdirs());
+ }
+
+ public void tearDown() throws IOException {
+ FileUtil.fullyDelete(DU_DIR);
+ }
+
+ private void createFile(File newFile, int size) throws IOException {
+ // write random data so that filesystems with compression enabled (e.g., ZFS)
+ // can't compress the file
+ Random random = new Random();
+ byte[] data = new byte[size];
+ random.nextBytes(data);
+
+ newFile.createNewFile();
+ RandomAccessFile file = new RandomAccessFile(newFile, "rws");
+
+ file.write(data);
+
+ file.getFD().sync();
+ file.close();
+ }
+
+ /**
+ * Verify that du returns expected used space for a file.
+ * We assume here that if a file system creates a file of size
+ * that is a multiple of the block size in this file system,
+ * then the used size for the file will be exactly that size.
+ * This is true for most file systems.
+ *
+ * @throws IOException
+ * @throws InterruptedException
+ */
+ public void testDU() throws IOException, InterruptedException {
+ int writtenSize = 32*1024; // writing 32K
+ File file = new File(DU_DIR, "data");
+ createFile(file, writtenSize);
+
+ Thread.sleep(5000); // let the metadata updater catch up
+
+ DU du = new DU(file, 10000);
+ du.start();
+ long duSize = du.getUsed();
+ du.shutdown();
+
+ assertEquals(writtenSize, duSize);
+
+ //test with 0 interval, will not launch thread
+ du = new DU(file, 0);
+ du.start();
+ duSize = du.getUsed();
+ du.shutdown();
+
+ assertEquals(writtenSize, duSize);
+
+ //test without launching thread
+ du = new DU(file, 10000);
+ duSize = du.getUsed();
+
+ assertEquals(writtenSize, duSize);
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestGetFileBlockLocations.java b/src/test/org/apache/hadoop/fs/TestGetFileBlockLocations.java
new file mode 100644
index 00000000000..c85cc988627
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestGetFileBlockLocations.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Testing the correctness of FileSystem.getFileBlockLocations.
+ */
+public class TestGetFileBlockLocations extends TestCase {
+ private static String TEST_ROOT_DIR =
+ System.getProperty("test.build.data", "/tmp/testGetFileBlockLocations");
+ private static final int FileLength = 4 * 1024 * 1024; // 4MB
+ private Configuration conf;
+ private Path path;
+ private FileSystem fs;
+ private Random random;
+
+ /**
+ * @see TestCase#setUp()
+ */
+ @Override
+ protected void setUp() throws IOException {
+ conf = new Configuration();
+ Path rootPath = new Path(TEST_ROOT_DIR);
+ path = new Path(rootPath, "TestGetFileBlockLocations");
+ fs = rootPath.getFileSystem(conf);
+ FSDataOutputStream fsdos = fs.create(path, true);
+ byte[] buffer = new byte[1024];
+ while (fsdos.getPos() < FileLength) {
+ fsdos.write(buffer);
+ }
+ fsdos.close();
+ random = new Random(System.nanoTime());
+ }
+
+ private void oneTest(int offBegin, int offEnd, FileStatus status)
+ throws IOException {
+ if (offBegin > offEnd) {
+ int tmp = offBegin;
+ offBegin = offEnd;
+ offEnd = tmp;
+ }
+ BlockLocation[] locations =
+ fs.getFileBlockLocations(status, offBegin, offEnd - offBegin);
+ if (offBegin < status.getLen()) {
+ Arrays.sort(locations, new Comparator() {
+
+ @Override
+ public int compare(BlockLocation arg0, BlockLocation arg1) {
+ long cmprv = arg0.getOffset() - arg1.getOffset();
+ if (cmprv < 0) return -1;
+ if (cmprv > 0) return 1;
+ cmprv = arg0.getLength() - arg1.getLength();
+ if (cmprv < 0) return -1;
+ if (cmprv > 0) return 1;
+ return 0;
+ }
+
+ });
+ offBegin = (int) Math.min(offBegin, status.getLen() - 1);
+ offEnd = (int) Math.min(offEnd, status.getLen());
+ BlockLocation first = locations[0];
+ BlockLocation last = locations[locations.length - 1];
+ assertTrue(first.getOffset() <= offBegin);
+ assertTrue(offEnd <= last.getOffset() + last.getLength());
+ } else {
+ assertTrue(locations.length == 0);
+ }
+ }
+ /**
+ * @see TestCase#tearDown()
+ */
+ @Override
+ protected void tearDown() throws IOException {
+ fs.delete(path, true);
+ fs.close();
+ }
+
+ public void testFailureNegativeParameters() throws IOException {
+ FileStatus status = fs.getFileStatus(path);
+ try {
+ BlockLocation[] locations = fs.getFileBlockLocations(status, -1, 100);
+ fail("Expecting exception being throw");
+ } catch (IllegalArgumentException e) {
+
+ }
+
+ try {
+ BlockLocation[] locations = fs.getFileBlockLocations(status, 100, -1);
+ fail("Expecting exception being throw");
+ } catch (IllegalArgumentException e) {
+
+ }
+ }
+
+ public void testGetFileBlockLocations1() throws IOException {
+ FileStatus status = fs.getFileStatus(path);
+ oneTest(0, (int) status.getLen(), status);
+ oneTest(0, (int) status.getLen() * 2, status);
+ oneTest((int) status.getLen() * 2, (int) status.getLen() * 4, status);
+ oneTest((int) status.getLen() / 2, (int) status.getLen() * 3, status);
+ for (int i = 0; i < 10; ++i) {
+ oneTest((int) status.getLen() * i / 10, (int) status.getLen() * (i + 1)
+ / 10, status);
+ }
+ }
+
+ public void testGetFileBlockLocations2() throws IOException {
+ FileStatus status = fs.getFileStatus(path);
+ for (int i = 0; i < 1000; ++i) {
+ int offBegin = random.nextInt((int) (2 * status.getLen()));
+ int offEnd = random.nextInt((int) (2 * status.getLen()));
+ oneTest(offBegin, offEnd, status);
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestGlobExpander.java b/src/test/org/apache/hadoop/fs/TestGlobExpander.java
new file mode 100644
index 00000000000..b0466b80229
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestGlobExpander.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.io.IOException;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class TestGlobExpander extends TestCase {
+
+ public void testExpansionIsIdentical() throws IOException {
+ checkExpansionIsIdentical("");
+ checkExpansionIsIdentical("/}");
+ checkExpansionIsIdentical("/}{a,b}");
+ checkExpansionIsIdentical("{/");
+ checkExpansionIsIdentical("{a}");
+ checkExpansionIsIdentical("{a,b}/{b,c}");
+ checkExpansionIsIdentical("p\\{a/b,c/d\\}s");
+ checkExpansionIsIdentical("p{a\\/b,c\\/d}s");
+ }
+
+ public void testExpansion() throws IOException {
+ checkExpansion("{a/b}", "a/b");
+ checkExpansion("/}{a/b}", "/}a/b");
+ checkExpansion("p{a/b,c/d}s", "pa/bs", "pc/ds");
+ checkExpansion("{a/b,c/d,{e,f}}", "a/b", "c/d", "{e,f}");
+ checkExpansion("{a/b,c/d}{e,f}", "a/b{e,f}", "c/d{e,f}");
+ checkExpansion("{a,b}/{b,{c/d,e/f}}", "{a,b}/b", "{a,b}/c/d", "{a,b}/e/f");
+ checkExpansion("{a,b}/{c/\\d}", "{a,b}/c/d");
+ }
+
+ private void checkExpansionIsIdentical(String filePattern) throws IOException {
+ checkExpansion(filePattern, filePattern);
+ }
+
+ private void checkExpansion(String filePattern, String... expectedExpansions)
+ throws IOException {
+ List actualExpansions = GlobExpander.expand(filePattern);
+ assertEquals("Different number of expansions", expectedExpansions.length,
+ actualExpansions.size());
+ for (int i = 0; i < expectedExpansions.length; i++) {
+ assertEquals("Expansion of " + filePattern, expectedExpansions[i],
+ actualExpansions.get(i));
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestLocalDirAllocator.java b/src/test/org/apache/hadoop/fs/TestLocalDirAllocator.java
new file mode 100644
index 00000000000..eef90308aa9
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestLocalDirAllocator.java
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Shell;
+
+import junit.framework.TestCase;
+
+/** This tests that LocalDirAllocator works correctly;
+ * Every test case uses different buffer dirs to
+ * enforce the AllocatorPerContext initialization.
+ * This test does not run on Cygwin because under Cygwin
+ * a directory can be created in a read-only directory
+ * which breaks this test.
+ */
+public class TestLocalDirAllocator extends TestCase {
+ final static private Configuration conf = new Configuration();
+ final static private String BUFFER_DIR_ROOT = "build/test/temp";
+ final static private Path BUFFER_PATH_ROOT = new Path(BUFFER_DIR_ROOT);
+ final static private File BUFFER_ROOT = new File(BUFFER_DIR_ROOT);
+ final static private String BUFFER_DIR[] = new String[] {
+ BUFFER_DIR_ROOT+"/tmp0", BUFFER_DIR_ROOT+"/tmp1", BUFFER_DIR_ROOT+"/tmp2",
+ BUFFER_DIR_ROOT+"/tmp3", BUFFER_DIR_ROOT+"/tmp4", BUFFER_DIR_ROOT+"/tmp5",
+ BUFFER_DIR_ROOT+"/tmp6"};
+ final static private Path BUFFER_PATH[] = new Path[] {
+ new Path(BUFFER_DIR[0]), new Path(BUFFER_DIR[1]), new Path(BUFFER_DIR[2]),
+ new Path(BUFFER_DIR[3]), new Path(BUFFER_DIR[4]), new Path(BUFFER_DIR[5]),
+ new Path(BUFFER_DIR[6])};
+ final static private String CONTEXT = "dfs.client.buffer.dir";
+ final static private String FILENAME = "block";
+ final static private LocalDirAllocator dirAllocator =
+ new LocalDirAllocator(CONTEXT);
+ static LocalFileSystem localFs;
+ final static private boolean isWindows =
+ System.getProperty("os.name").startsWith("Windows");
+ final static int SMALL_FILE_SIZE = 100;
+ static {
+ try {
+ localFs = FileSystem.getLocal(conf);
+ rmBufferDirs();
+ } catch(IOException e) {
+ System.out.println(e.getMessage());
+ e.printStackTrace();
+ System.exit(-1);
+ }
+ }
+
+ private static void rmBufferDirs() throws IOException {
+ assertTrue(!localFs.exists(BUFFER_PATH_ROOT) ||
+ localFs.delete(BUFFER_PATH_ROOT, true));
+ }
+
+ private void validateTempDirCreation(int i) throws IOException {
+ File result = createTempFile(SMALL_FILE_SIZE);
+ assertTrue("Checking for " + BUFFER_DIR[i] + " in " + result + " - FAILED!",
+ result.getPath().startsWith(new File(BUFFER_DIR[i], FILENAME).getPath()));
+ }
+
+ private File createTempFile() throws IOException {
+ File result = dirAllocator.createTmpFileForWrite(FILENAME, -1, conf);
+ result.delete();
+ return result;
+ }
+
+ private File createTempFile(long size) throws IOException {
+ File result = dirAllocator.createTmpFileForWrite(FILENAME, size, conf);
+ result.delete();
+ return result;
+ }
+
+ /** Two buffer dirs. The first dir does not exist & is on a read-only disk;
+ * The second dir exists & is RW
+ * @throws Exception
+ */
+ public void test0() throws Exception {
+ if (isWindows) return;
+ try {
+ conf.set(CONTEXT, BUFFER_DIR[0]+","+BUFFER_DIR[1]);
+ assertTrue(localFs.mkdirs(BUFFER_PATH[1]));
+ BUFFER_ROOT.setReadOnly();
+ validateTempDirCreation(1);
+ validateTempDirCreation(1);
+ } finally {
+ Shell.execCommand(new String[]{"chmod", "u+w", BUFFER_DIR_ROOT});
+ rmBufferDirs();
+ }
+ }
+
+ /** Two buffer dirs. The first dir exists & is on a read-only disk;
+ * The second dir exists & is RW
+ * @throws Exception
+ */
+ public void test1() throws Exception {
+ if (isWindows) return;
+ try {
+ conf.set(CONTEXT, BUFFER_DIR[1]+","+BUFFER_DIR[2]);
+ assertTrue(localFs.mkdirs(BUFFER_PATH[2]));
+ BUFFER_ROOT.setReadOnly();
+ validateTempDirCreation(2);
+ validateTempDirCreation(2);
+ } finally {
+ Shell.execCommand(new String[]{"chmod", "u+w", BUFFER_DIR_ROOT});
+ rmBufferDirs();
+ }
+ }
+ /** Two buffer dirs. Both do not exist but are on a RW disk.
+ * Check if tmp dirs are allocated in a round-robin
+ */
+ public void test2() throws Exception {
+ if (isWindows) return;
+ try {
+ conf.set(CONTEXT, BUFFER_DIR[2]+","+BUFFER_DIR[3]);
+
+ // create the first file, and then figure the round-robin sequence
+ createTempFile(SMALL_FILE_SIZE);
+ int firstDirIdx = (dirAllocator.getCurrentDirectoryIndex() == 0) ? 2 : 3;
+ int secondDirIdx = (firstDirIdx == 2) ? 3 : 2;
+
+ // check if tmp dirs are allocated in a round-robin manner
+ validateTempDirCreation(firstDirIdx);
+ validateTempDirCreation(secondDirIdx);
+ validateTempDirCreation(firstDirIdx);
+ } finally {
+ rmBufferDirs();
+ }
+ }
+
+ /** Two buffer dirs. Both exist and are on a R/W disk.
+ * Later disk1 becomes read-only.
+ * @throws Exception
+ */
+ public void test3() throws Exception {
+ if (isWindows) return;
+ try {
+ conf.set(CONTEXT, BUFFER_DIR[3]+","+BUFFER_DIR[4]);
+ assertTrue(localFs.mkdirs(BUFFER_PATH[3]));
+ assertTrue(localFs.mkdirs(BUFFER_PATH[4]));
+
+ // create the first file with size, and then figure the round-robin sequence
+ createTempFile(SMALL_FILE_SIZE);
+
+ int nextDirIdx = (dirAllocator.getCurrentDirectoryIndex() == 0) ? 3 : 4;
+ validateTempDirCreation(nextDirIdx);
+
+ // change buffer directory 2 to be read only
+ new File(BUFFER_DIR[4]).setReadOnly();
+ validateTempDirCreation(3);
+ validateTempDirCreation(3);
+ } finally {
+ rmBufferDirs();
+ }
+ }
+
+ /**
+ * Two buffer dirs, on read-write disk.
+ *
+ * Try to create a whole bunch of files.
+ * Verify that they do indeed all get created where they should.
+ *
+ * Would ideally check statistical properties of distribution, but
+ * we don't have the nerve to risk false-positives here.
+ *
+ * @throws Exception
+ */
+ static final int TRIALS = 100;
+ public void test4() throws Exception {
+ if (isWindows) return;
+ try {
+
+ conf.set(CONTEXT, BUFFER_DIR[5]+","+BUFFER_DIR[6]);
+ assertTrue(localFs.mkdirs(BUFFER_PATH[5]));
+ assertTrue(localFs.mkdirs(BUFFER_PATH[6]));
+
+ int inDir5=0, inDir6=0;
+ for(int i = 0; i < TRIALS; ++i) {
+ File result = createTempFile();
+ if(result.getPath().startsWith(new File(BUFFER_DIR[5], FILENAME).getPath())) {
+ inDir5++;
+ } else if(result.getPath().startsWith(new File(BUFFER_DIR[6], FILENAME).getPath())) {
+ inDir6++;
+ }
+ result.delete();
+ }
+
+ assertTrue( inDir5 + inDir6 == TRIALS);
+
+ } finally {
+ rmBufferDirs();
+ }
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/TestLocalFileSystem.java b/src/test/org/apache/hadoop/fs/TestLocalFileSystem.java
new file mode 100644
index 00000000000..b244b9b5df4
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestLocalFileSystem.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import org.apache.hadoop.conf.Configuration;
+import java.io.*;
+import junit.framework.*;
+
+/**
+ * This class tests the local file system via the FileSystem abstraction.
+ */
+public class TestLocalFileSystem extends TestCase {
+ private static String TEST_ROOT_DIR
+ = System.getProperty("test.build.data","build/test/data/work-dir/localfs");
+
+
+ static void writeFile(FileSystem fs, Path name) throws IOException {
+ FSDataOutputStream stm = fs.create(name);
+ stm.writeBytes("42\n");
+ stm.close();
+ }
+
+ static String readFile(FileSystem fs, Path name) throws IOException {
+ byte[] b = new byte[1024];
+ int offset = 0;
+ FSDataInputStream in = fs.open(name);
+ for(int remaining, n;
+ (remaining = b.length - offset) > 0 && (n = in.read(b, offset, remaining)) != -1;
+ offset += n);
+ in.close();
+
+ String s = new String(b, 0, offset);
+ System.out.println("s=" + s);
+ return s;
+ }
+
+ private void cleanupFile(FileSystem fs, Path name) throws IOException {
+ assertTrue(fs.exists(name));
+ fs.delete(name, true);
+ assertTrue(!fs.exists(name));
+ }
+
+ /**
+ * Test the capability of setting the working directory.
+ */
+ public void testWorkingDirectory() throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fileSys = FileSystem.getLocal(conf);
+ Path origDir = fileSys.getWorkingDirectory();
+ Path subdir = new Path(TEST_ROOT_DIR, "new");
+ try {
+ // make sure it doesn't already exist
+ assertTrue(!fileSys.exists(subdir));
+ // make it and check for it
+ assertTrue(fileSys.mkdirs(subdir));
+ assertTrue(fileSys.isDirectory(subdir));
+
+ fileSys.setWorkingDirectory(subdir);
+
+ // create a directory and check for it
+ Path dir1 = new Path("dir1");
+ assertTrue(fileSys.mkdirs(dir1));
+ assertTrue(fileSys.isDirectory(dir1));
+
+ // delete the directory and make sure it went away
+ fileSys.delete(dir1, true);
+ assertTrue(!fileSys.exists(dir1));
+
+ // create files and manipulate them.
+ Path file1 = new Path("file1");
+ Path file2 = new Path("sub/file2");
+ writeFile(fileSys, file1);
+ fileSys.copyFromLocalFile(file1, file2);
+ assertTrue(fileSys.exists(file1));
+ assertTrue(fileSys.isFile(file1));
+ cleanupFile(fileSys, file2);
+ fileSys.copyToLocalFile(file1, file2);
+ cleanupFile(fileSys, file2);
+
+ // try a rename
+ fileSys.rename(file1, file2);
+ assertTrue(!fileSys.exists(file1));
+ assertTrue(fileSys.exists(file2));
+ fileSys.rename(file2, file1);
+
+ // try reading a file
+ InputStream stm = fileSys.open(file1);
+ byte[] buffer = new byte[3];
+ int bytesRead = stm.read(buffer, 0, 3);
+ assertEquals("42\n", new String(buffer, 0, bytesRead));
+ stm.close();
+ } finally {
+ fileSys.setWorkingDirectory(origDir);
+ fileSys.delete(subdir, true);
+ }
+ }
+
+ public void testCopy() throws IOException {
+ Configuration conf = new Configuration();
+ LocalFileSystem fs = FileSystem.getLocal(conf);
+ Path src = new Path(TEST_ROOT_DIR, "dingo");
+ Path dst = new Path(TEST_ROOT_DIR, "yak");
+ writeFile(fs, src);
+ assertTrue(FileUtil.copy(fs, src, fs, dst, true, false, conf));
+ assertTrue(!fs.exists(src) && fs.exists(dst));
+ assertTrue(FileUtil.copy(fs, dst, fs, src, false, false, conf));
+ assertTrue(fs.exists(src) && fs.exists(dst));
+ assertTrue(FileUtil.copy(fs, src, fs, dst, true, true, conf));
+ assertTrue(!fs.exists(src) && fs.exists(dst));
+ fs.mkdirs(src);
+ assertTrue(FileUtil.copy(fs, dst, fs, src, false, false, conf));
+ Path tmp = new Path(src, dst.getName());
+ assertTrue(fs.exists(tmp) && fs.exists(dst));
+ assertTrue(FileUtil.copy(fs, dst, fs, src, false, true, conf));
+ assertTrue(fs.delete(tmp, true));
+ fs.mkdirs(tmp);
+ try {
+ FileUtil.copy(fs, dst, fs, src, true, true, conf);
+ fail("Failed to detect existing dir");
+ } catch (IOException e) { }
+ }
+
+ public void testHomeDirectory() throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fileSys = FileSystem.getLocal(conf);
+ Path home = new Path(System.getProperty("user.home"))
+ .makeQualified(fileSys);
+ Path fsHome = fileSys.getHomeDirectory();
+ assertEquals(home, fsHome);
+ }
+
+ public void testPathEscapes() throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.getLocal(conf);
+ Path path = new Path(TEST_ROOT_DIR, "foo%bar");
+ writeFile(fs, path);
+ FileStatus status = fs.getFileStatus(path);
+ assertEquals(path.makeQualified(fs), status.getPath());
+ cleanupFile(fs, path);
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestLocalFileSystemPermission.java b/src/test/org/apache/hadoop/fs/TestLocalFileSystemPermission.java
new file mode 100644
index 00000000000..f68cdb66cdf
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestLocalFileSystemPermission.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.permission.*;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Shell;
+
+import java.io.*;
+import java.util.*;
+
+import junit.framework.*;
+
+/**
+ * This class tests the local file system via the FileSystem abstraction.
+ */
+public class TestLocalFileSystemPermission extends TestCase {
+ static final String TEST_PATH_PREFIX = new Path(System.getProperty(
+ "test.build.data", "/tmp")).toString().replace(' ', '_')
+ + "/" + TestLocalFileSystemPermission.class.getSimpleName() + "_";
+
+ {
+ try {
+ ((org.apache.commons.logging.impl.Log4JLogger)FileSystem.LOG).getLogger()
+ .setLevel(org.apache.log4j.Level.DEBUG);
+ }
+ catch(Exception e) {
+ System.out.println("Cannot change log level\n"
+ + StringUtils.stringifyException(e));
+ }
+ }
+
+ private Path writeFile(FileSystem fs, String name) throws IOException {
+ Path f = new Path(TEST_PATH_PREFIX + name);
+ FSDataOutputStream stm = fs.create(f);
+ stm.writeBytes("42\n");
+ stm.close();
+ return f;
+ }
+
+ private void cleanupFile(FileSystem fs, Path name) throws IOException {
+ assertTrue(fs.exists(name));
+ fs.delete(name, true);
+ assertTrue(!fs.exists(name));
+ }
+
+ /** Test LocalFileSystem.setPermission */
+ public void testLocalFSsetPermission() throws IOException {
+ if (Path.WINDOWS) {
+ System.out.println("Cannot run test for Windows");
+ return;
+ }
+ Configuration conf = new Configuration();
+ LocalFileSystem localfs = FileSystem.getLocal(conf);
+ String filename = "foo";
+ Path f = writeFile(localfs, filename);
+ try {
+ System.out.println(filename + ": " + getPermission(localfs, f));
+ }
+ catch(Exception e) {
+ System.out.println(StringUtils.stringifyException(e));
+ System.out.println("Cannot run test");
+ return;
+ }
+
+ try {
+ // create files and manipulate them.
+ FsPermission all = new FsPermission((short)0777);
+ FsPermission none = new FsPermission((short)0);
+
+ localfs.setPermission(f, none);
+ assertEquals(none, getPermission(localfs, f));
+
+ localfs.setPermission(f, all);
+ assertEquals(all, getPermission(localfs, f));
+ }
+ finally {cleanupFile(localfs, f);}
+ }
+
+ FsPermission getPermission(LocalFileSystem fs, Path p) throws IOException {
+ return fs.getFileStatus(p).getPermission();
+ }
+
+ /** Test LocalFileSystem.setOwner */
+ public void testLocalFSsetOwner() throws IOException {
+ if (Path.WINDOWS) {
+ System.out.println("Cannot run test for Windows");
+ return;
+ }
+
+ Configuration conf = new Configuration();
+ LocalFileSystem localfs = FileSystem.getLocal(conf);
+ String filename = "bar";
+ Path f = writeFile(localfs, filename);
+ List groups = null;
+ try {
+ groups = getGroups();
+ System.out.println(filename + ": " + getPermission(localfs, f));
+ }
+ catch(IOException e) {
+ System.out.println(StringUtils.stringifyException(e));
+ System.out.println("Cannot run test");
+ return;
+ }
+ if (groups == null || groups.size() < 1) {
+ System.out.println("Cannot run test: need at least one group. groups="
+ + groups);
+ return;
+ }
+
+ // create files and manipulate them.
+ try {
+ String g0 = groups.get(0);
+ localfs.setOwner(f, null, g0);
+ assertEquals(g0, getGroup(localfs, f));
+
+ if (groups.size() > 1) {
+ String g1 = groups.get(1);
+ localfs.setOwner(f, null, g1);
+ assertEquals(g1, getGroup(localfs, f));
+ } else {
+ System.out.println("Not testing changing the group since user " +
+ "belongs to only one group.");
+ }
+ }
+ finally {cleanupFile(localfs, f);}
+ }
+
+ static List getGroups() throws IOException {
+ List a = new ArrayList();
+ String s = Shell.execCommand(Shell.getGROUPS_COMMAND());
+ for(StringTokenizer t = new StringTokenizer(s); t.hasMoreTokens(); ) {
+ a.add(t.nextToken());
+ }
+ return a;
+ }
+
+ String getGroup(LocalFileSystem fs, Path p) throws IOException {
+ return fs.getFileStatus(p).getGroup();
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestPath.java b/src/test/org/apache/hadoop/fs/TestPath.java
new file mode 100644
index 00000000000..4fa28bc77ce
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestPath.java
@@ -0,0 +1,152 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.util.*;
+import junit.framework.TestCase;
+
+public class TestPath extends TestCase {
+ public void testToString() {
+ toStringTest("/");
+ toStringTest("/foo");
+ toStringTest("/foo/bar");
+ toStringTest("foo");
+ toStringTest("foo/bar");
+ boolean emptyException = false;
+ try {
+ toStringTest("");
+ } catch (IllegalArgumentException e) {
+ // expect to receive an IllegalArgumentException
+ emptyException = true;
+ }
+ assertTrue(emptyException);
+ if (Path.WINDOWS) {
+ toStringTest("c:");
+ toStringTest("c:/");
+ toStringTest("c:foo");
+ toStringTest("c:foo/bar");
+ toStringTest("c:foo/bar");
+ toStringTest("c:/foo/bar");
+ }
+ }
+
+ private void toStringTest(String pathString) {
+ assertEquals(pathString, new Path(pathString).toString());
+ }
+
+ public void testNormalize() {
+ assertEquals("/", new Path("//").toString());
+ assertEquals("/foo", new Path("/foo/").toString());
+ assertEquals("/foo", new Path("/foo/").toString());
+ assertEquals("foo", new Path("foo/").toString());
+ assertEquals("foo", new Path("foo//").toString());
+ assertEquals("foo/bar", new Path("foo//bar").toString());
+ if (Path.WINDOWS) {
+ assertEquals("c:/a/b", new Path("c:\\a\\b").toString());
+ }
+ }
+
+ public void testIsAbsolute() {
+ assertTrue(new Path("/").isAbsolute());
+ assertTrue(new Path("/foo").isAbsolute());
+ assertFalse(new Path("foo").isAbsolute());
+ assertFalse(new Path("foo/bar").isAbsolute());
+ assertFalse(new Path(".").isAbsolute());
+ if (Path.WINDOWS) {
+ assertTrue(new Path("c:/a/b").isAbsolute());
+ assertFalse(new Path("c:a/b").isAbsolute());
+ }
+ }
+
+ public void testParent() {
+ assertEquals(new Path("/foo"), new Path("/foo/bar").getParent());
+ assertEquals(new Path("foo"), new Path("foo/bar").getParent());
+ assertEquals(new Path("/"), new Path("/foo").getParent());
+ if (Path.WINDOWS) {
+ assertEquals(new Path("c:/"), new Path("c:/foo").getParent());
+ }
+ }
+
+ public void testChild() {
+ assertEquals(new Path("."), new Path(".", "."));
+ assertEquals(new Path("/"), new Path("/", "."));
+ assertEquals(new Path("/"), new Path(".", "/"));
+ assertEquals(new Path("/foo"), new Path("/", "foo"));
+ assertEquals(new Path("/foo/bar"), new Path("/foo", "bar"));
+ assertEquals(new Path("/foo/bar/baz"), new Path("/foo/bar", "baz"));
+ assertEquals(new Path("/foo/bar/baz"), new Path("/foo", "bar/baz"));
+ assertEquals(new Path("foo"), new Path(".", "foo"));
+ assertEquals(new Path("foo/bar"), new Path("foo", "bar"));
+ assertEquals(new Path("foo/bar/baz"), new Path("foo", "bar/baz"));
+ assertEquals(new Path("foo/bar/baz"), new Path("foo/bar", "baz"));
+ assertEquals(new Path("/foo"), new Path("/bar", "/foo"));
+ if (Path.WINDOWS) {
+ assertEquals(new Path("c:/foo"), new Path("/bar", "c:/foo"));
+ assertEquals(new Path("c:/foo"), new Path("d:/bar", "c:/foo"));
+ }
+ }
+
+ public void testEquals() {
+ assertFalse(new Path("/").equals(new Path("/foo")));
+ }
+
+ public void testDots() {
+ // Test Path(String)
+ assertEquals(new Path("/foo/bar/baz").toString(), "/foo/bar/baz");
+ assertEquals(new Path("/foo/bar", ".").toString(), "/foo/bar");
+ assertEquals(new Path("/foo/bar/../baz").toString(), "/foo/baz");
+ assertEquals(new Path("/foo/bar/./baz").toString(), "/foo/bar/baz");
+ assertEquals(new Path("/foo/bar/baz/../../fud").toString(), "/foo/fud");
+ assertEquals(new Path("/foo/bar/baz/.././../fud").toString(), "/foo/fud");
+ assertEquals(new Path("../../foo/bar").toString(), "../../foo/bar");
+ assertEquals(new Path(".././../foo/bar").toString(), "../../foo/bar");
+ assertEquals(new Path("./foo/bar/baz").toString(), "foo/bar/baz");
+ assertEquals(new Path("/foo/bar/../../baz/boo").toString(), "/baz/boo");
+ assertEquals(new Path("foo/bar/").toString(), "foo/bar");
+ assertEquals(new Path("foo/bar/../baz").toString(), "foo/baz");
+ assertEquals(new Path("foo/bar/../../baz/boo").toString(), "baz/boo");
+
+
+ // Test Path(Path,Path)
+ assertEquals(new Path("/foo/bar", "baz/boo").toString(), "/foo/bar/baz/boo");
+ assertEquals(new Path("foo/bar/","baz/bud").toString(), "foo/bar/baz/bud");
+
+ assertEquals(new Path("/foo/bar","../../boo/bud").toString(), "/boo/bud");
+ assertEquals(new Path("foo/bar","../../boo/bud").toString(), "boo/bud");
+ assertEquals(new Path(".","boo/bud").toString(), "boo/bud");
+
+ assertEquals(new Path("/foo/bar/baz","../../boo/bud").toString(), "/foo/boo/bud");
+ assertEquals(new Path("foo/bar/baz","../../boo/bud").toString(), "foo/boo/bud");
+
+
+ assertEquals(new Path("../../","../../boo/bud").toString(), "../../../../boo/bud");
+ assertEquals(new Path("../../foo","../../../boo/bud").toString(), "../../../../boo/bud");
+ assertEquals(new Path("../../foo/bar","../boo/bud").toString(), "../../foo/boo/bud");
+
+ assertEquals(new Path("foo/bar/baz","../../..").toString(), "");
+ assertEquals(new Path("foo/bar/baz","../../../../..").toString(), "../..");
+ }
+
+ public void testScheme() throws java.io.IOException {
+ assertEquals("foo:/bar", new Path("foo:/","/bar").toString());
+ assertEquals("foo://bar/baz", new Path("foo://bar/","/baz").toString());
+ }
+
+
+}
diff --git a/src/test/org/apache/hadoop/fs/TestTrash.java b/src/test/org/apache/hadoop/fs/TestTrash.java
new file mode 100644
index 00000000000..cff1f2419b7
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestTrash.java
@@ -0,0 +1,313 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+
+import junit.framework.TestCase;
+import java.io.File;
+import java.io.IOException;
+import java.io.DataOutputStream;
+import java.net.URI;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FsShell;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.Trash;
+import org.apache.hadoop.fs.LocalFileSystem;
+
+/**
+ * This class tests commands from Trash.
+ */
+public class TestTrash extends TestCase {
+
+ private final static Path TEST_DIR =
+ new Path(new File(System.getProperty("test.build.data","/tmp")
+ ).toURI().toString().replace(' ', '+'), "testTrash");
+
+ protected static Path writeFile(FileSystem fs, Path f) throws IOException {
+ DataOutputStream out = fs.create(f);
+ out.writeBytes("dhruba: " + f);
+ out.close();
+ assertTrue(fs.exists(f));
+ return f;
+ }
+
+ protected static Path mkdir(FileSystem fs, Path p) throws IOException {
+ assertTrue(fs.mkdirs(p));
+ assertTrue(fs.exists(p));
+ assertTrue(fs.getFileStatus(p).isDir());
+ return p;
+ }
+
+ // check that the specified file is in Trash
+ protected static void checkTrash(FileSystem fs, Path trashRoot,
+ Path path) throws IOException {
+ Path p = new Path(trashRoot+"/"+ path.toUri().getPath());
+ assertTrue(fs.exists(p));
+ }
+
+ // check that the specified file is not in Trash
+ static void checkNotInTrash(FileSystem fs, Path trashRoot, String pathname)
+ throws IOException {
+ Path p = new Path(trashRoot+"/"+ new Path(pathname).getName());
+ assertTrue(!fs.exists(p));
+ }
+
+ protected static void trashShell(final FileSystem fs, final Path base)
+ throws IOException {
+ Configuration conf = new Configuration();
+ conf.set("fs.trash.interval", "10"); // 10 minutes
+ conf.set("fs.default.name", fs.getUri().toString());
+ FsShell shell = new FsShell();
+ shell.setConf(conf);
+ Path trashRoot = null;
+
+ // First create a new directory with mkdirs
+ Path myPath = new Path(base, "test/mkdirs");
+ mkdir(fs, myPath);
+
+ // Second, create a file in that directory.
+ Path myFile = new Path(base, "test/mkdirs/myFile");
+ writeFile(fs, myFile);
+
+ // Verify that expunge without Trash directory
+ // won't throw Exception
+ {
+ String[] args = new String[1];
+ args[0] = "-expunge";
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+ }
+
+ // Verify that we succeed in removing the file we created.
+ // This should go into Trash.
+ {
+ String[] args = new String[2];
+ args[0] = "-rm";
+ args[1] = myFile.toString();
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+
+ trashRoot = shell.getCurrentTrashDir();
+ checkTrash(fs, trashRoot, myFile);
+ }
+
+ // Verify that we can recreate the file
+ writeFile(fs, myFile);
+
+ // Verify that we succeed in removing the file we re-created
+ {
+ String[] args = new String[2];
+ args[0] = "-rm";
+ args[1] = new Path(base, "test/mkdirs/myFile").toString();
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+ }
+
+ // Verify that we can recreate the file
+ writeFile(fs, myFile);
+
+ // Verify that we succeed in removing the whole directory
+ // along with the file inside it.
+ {
+ String[] args = new String[2];
+ args[0] = "-rmr";
+ args[1] = new Path(base, "test/mkdirs").toString();
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+ }
+
+ // recreate directory
+ mkdir(fs, myPath);
+
+ // Verify that we succeed in removing the whole directory
+ {
+ String[] args = new String[2];
+ args[0] = "-rmr";
+ args[1] = new Path(base, "test/mkdirs").toString();
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+ }
+
+ // Check that we can delete a file from the trash
+ {
+ Path toErase = new Path(trashRoot, "toErase");
+ int retVal = -1;
+ writeFile(fs, toErase);
+ try {
+ retVal = shell.run(new String[] {"-rm", toErase.toString()});
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(retVal == 0);
+ checkNotInTrash (fs, trashRoot, toErase.toString());
+ checkNotInTrash (fs, trashRoot, toErase.toString()+".1");
+ }
+
+ // simulate Trash removal
+ {
+ String[] args = new String[1];
+ args[0] = "-expunge";
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+ }
+
+ // verify that after expunging the Trash, it really goes away
+ checkNotInTrash(fs, trashRoot, new Path(base, "test/mkdirs/myFile").toString());
+
+ // recreate directory and file
+ mkdir(fs, myPath);
+ writeFile(fs, myFile);
+
+ // remove file first, then remove directory
+ {
+ String[] args = new String[2];
+ args[0] = "-rm";
+ args[1] = myFile.toString();
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+ checkTrash(fs, trashRoot, myFile);
+
+ args = new String[2];
+ args[0] = "-rmr";
+ args[1] = myPath.toString();
+ val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == 0);
+ checkTrash(fs, trashRoot, myPath);
+ }
+
+ // attempt to remove parent of trash
+ {
+ String[] args = new String[2];
+ args[0] = "-rmr";
+ args[1] = trashRoot.getParent().getParent().toString();
+ int val = -1;
+ try {
+ val = shell.run(args);
+ } catch (Exception e) {
+ System.err.println("Exception raised from Trash.run " +
+ e.getLocalizedMessage());
+ }
+ assertTrue(val == -1);
+ assertTrue(fs.exists(trashRoot));
+ }
+ }
+
+ public static void trashNonDefaultFS(Configuration conf) throws IOException {
+ conf.set("fs.trash.interval", "10"); // 10 minutes
+ // attempt non-default FileSystem trash
+ {
+ final FileSystem lfs = FileSystem.getLocal(conf);
+ Path p = TEST_DIR;
+ Path f = new Path(p, "foo/bar");
+ if (lfs.exists(p)) {
+ lfs.delete(p, true);
+ }
+ try {
+ f = writeFile(lfs, f);
+
+ FileSystem.closeAll();
+ FileSystem localFs = FileSystem.get(URI.create("file:///"), conf);
+ Trash lTrash = new Trash(localFs, conf);
+ lTrash.moveToTrash(f.getParent());
+ checkTrash(localFs, lTrash.getCurrentTrashDir(), f);
+ } finally {
+ if (lfs.exists(p)) {
+ lfs.delete(p, true);
+ }
+ }
+ }
+ }
+
+ public void testTrash() throws IOException {
+ Configuration conf = new Configuration();
+ conf.setClass("fs.file.impl", TestLFS.class, FileSystem.class);
+ trashShell(FileSystem.getLocal(conf), TEST_DIR);
+ }
+
+ public void testNonDefaultFS() throws IOException {
+ Configuration conf = new Configuration();
+ conf.setClass("fs.file.impl", TestLFS.class, FileSystem.class);
+ conf.set("fs.default.name", "invalid://host/bar/foo");
+ trashNonDefaultFS(conf);
+ }
+
+ static class TestLFS extends LocalFileSystem {
+ Path home;
+ TestLFS() {
+ this(TEST_DIR);
+ }
+ TestLFS(Path home) {
+ super();
+ this.home = home;
+ }
+ public Path getHomeDirectory() {
+ return home;
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/TestTruncatedInputBug.java b/src/test/org/apache/hadoop/fs/TestTruncatedInputBug.java
new file mode 100644
index 00000000000..e7dabf903cd
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/TestTruncatedInputBug.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * test for the input truncation bug when mark/reset is used.
+ * HADOOP-1489
+ */
+public class TestTruncatedInputBug extends TestCase {
+ private static String TEST_ROOT_DIR =
+ new Path(System.getProperty("test.build.data","/tmp"))
+ .toString().replace(' ', '+');
+
+ private void writeFile(FileSystem fileSys,
+ Path name, int nBytesToWrite)
+ throws IOException {
+ DataOutputStream out = fileSys.create(name);
+ for (int i = 0; i < nBytesToWrite; ++i) {
+ out.writeByte(0);
+ }
+ out.close();
+ }
+
+ /**
+ * When mark() is used on BufferedInputStream, the request
+ * size on the checksum file system can be small. However,
+ * checksum file system currently depends on the request size
+ * >= bytesPerSum to work properly.
+ */
+ public void testTruncatedInputBug() throws IOException {
+ final int ioBufSize = 512;
+ final int fileSize = ioBufSize*4;
+ int filePos = 0;
+
+ Configuration conf = new Configuration();
+ conf.setInt("io.file.buffer.size", ioBufSize);
+ FileSystem fileSys = FileSystem.getLocal(conf);
+
+ try {
+ // First create a test input file.
+ Path testFile = new Path(TEST_ROOT_DIR, "HADOOP-1489");
+ writeFile(fileSys, testFile, fileSize);
+ assertTrue(fileSys.exists(testFile));
+ assertTrue(fileSys.getFileStatus(testFile).getLen() == fileSize);
+
+ // Now read the file for ioBufSize bytes
+ FSDataInputStream in = fileSys.open(testFile, ioBufSize);
+ // seek beyond data buffered by open
+ filePos += ioBufSize * 2 + (ioBufSize - 10);
+ in.seek(filePos);
+
+ // read 4 more bytes before marking
+ for (int i = 0; i < 4; ++i) {
+ if (in.read() == -1) {
+ break;
+ }
+ ++filePos;
+ }
+
+ // Now set mark() to trigger the bug
+ // NOTE: in the fixed code, mark() does nothing (not supported) and
+ // hence won't trigger this bug.
+ in.mark(1);
+ System.out.println("MARKED");
+
+ // Try to read the rest
+ while (filePos < fileSize) {
+ if (in.read() == -1) {
+ break;
+ }
+ ++filePos;
+ }
+ in.close();
+
+ System.out.println("Read " + filePos + " bytes."
+ + " file size=" + fileSize);
+ assertTrue(filePos == fileSize);
+
+ } finally {
+ try {
+ fileSys.close();
+ } catch (Exception e) {
+ // noop
+ }
+ }
+ } // end testTruncatedInputBug
+}
diff --git a/src/test/org/apache/hadoop/fs/kfs/KFSEmulationImpl.java b/src/test/org/apache/hadoop/fs/kfs/KFSEmulationImpl.java
new file mode 100644
index 00000000000..9c7b5bafef4
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/kfs/KFSEmulationImpl.java
@@ -0,0 +1,150 @@
+/**
+ *
+ * Licensed under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * @author: Sriram Rao (Kosmix Corp.)
+ *
+ * We need to provide the ability to the code in fs/kfs without really
+ * having a KFS deployment. For this purpose, use the LocalFileSystem
+ * as a way to "emulate" KFS.
+ */
+
+package org.apache.hadoop.fs.kfs;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Progressable;
+
+public class KFSEmulationImpl implements IFSImpl {
+ FileSystem localFS;
+
+ public KFSEmulationImpl(Configuration conf) throws IOException {
+ localFS = FileSystem.getLocal(conf);
+ }
+
+ public boolean exists(String path) throws IOException {
+ return localFS.exists(new Path(path));
+ }
+ public boolean isDirectory(String path) throws IOException {
+ return localFS.isDirectory(new Path(path));
+ }
+ public boolean isFile(String path) throws IOException {
+ return localFS.isFile(new Path(path));
+ }
+
+ public String[] readdir(String path) throws IOException {
+ FileStatus[] p = localFS.listStatus(new Path(path));
+ String[] entries = null;
+
+ if (p == null) {
+ return null;
+ }
+
+ entries = new String[p.length];
+ for (int i = 0; i < p.length; i++)
+ entries[i] = p[i].getPath().toString();
+ return entries;
+ }
+
+ public FileStatus[] readdirplus(Path path) throws IOException {
+ return localFS.listStatus(path);
+ }
+
+ public int mkdirs(String path) throws IOException {
+ if (localFS.mkdirs(new Path(path)))
+ return 0;
+
+ return -1;
+ }
+
+ public int rename(String source, String dest) throws IOException {
+ if (localFS.rename(new Path(source), new Path(dest)))
+ return 0;
+ return -1;
+ }
+
+ public int rmdir(String path) throws IOException {
+ if (isDirectory(path)) {
+ // the directory better be empty
+ String[] dirEntries = readdir(path);
+ if ((dirEntries.length <= 2) && (localFS.delete(new Path(path), true)))
+ return 0;
+ }
+ return -1;
+ }
+
+ public int remove(String path) throws IOException {
+ if (isFile(path) && (localFS.delete(new Path(path), true)))
+ return 0;
+ return -1;
+ }
+
+ public long filesize(String path) throws IOException {
+ return localFS.getFileStatus(new Path(path)).getLen();
+ }
+ public short getReplication(String path) throws IOException {
+ return 1;
+ }
+ public short setReplication(String path, short replication) throws IOException {
+ return 1;
+ }
+ public String[][] getDataLocation(String path, long start, long len) throws IOException {
+ BlockLocation[] blkLocations =
+ localFS.getFileBlockLocations(localFS.getFileStatus(new Path(path)),
+ start, len);
+ if ((blkLocations == null) || (blkLocations.length == 0)) {
+ return new String[0][];
+ }
+ int blkCount = blkLocations.length;
+ String[][]hints = new String[blkCount][];
+ for (int i=0; i < blkCount ; i++) {
+ String[] hosts = blkLocations[i].getHosts();
+ hints[i] = new String[hosts.length];
+ hints[i] = hosts;
+ }
+ return hints;
+ }
+
+ public long getModificationTime(String path) throws IOException {
+ FileStatus s = localFS.getFileStatus(new Path(path));
+ if (s == null)
+ return 0;
+
+ return s.getModificationTime();
+ }
+
+ public FSDataOutputStream append(String path, int bufferSize, Progressable progress) throws IOException {
+ // besides path/overwrite, the other args don't matter for
+ // testing purposes.
+ return localFS.append(new Path(path));
+ }
+
+ public FSDataOutputStream create(String path, short replication, int bufferSize, Progressable progress) throws IOException {
+ // besides path/overwrite, the other args don't matter for
+ // testing purposes.
+ return localFS.create(new Path(path));
+ }
+
+ public FSDataInputStream open(String path, int bufferSize) throws IOException {
+ return localFS.open(new Path(path));
+ }
+
+
+};
diff --git a/src/test/org/apache/hadoop/fs/kfs/TestKosmosFileSystem.java b/src/test/org/apache/hadoop/fs/kfs/TestKosmosFileSystem.java
new file mode 100644
index 00000000000..c853f2af3f3
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/kfs/TestKosmosFileSystem.java
@@ -0,0 +1,204 @@
+/**
+ *
+ * Licensed under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * @author: Sriram Rao (Kosmix Corp.)
+ *
+ * Unit tests for testing the KosmosFileSystem API implementation.
+ */
+
+package org.apache.hadoop.fs.kfs;
+
+import java.io.*;
+import java.net.*;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.hadoop.fs.kfs.KosmosFileSystem;
+
/**
 * Unit tests for the KosmosFileSystem API, run against the local-FS-backed
 * KFS emulation (no real KFS deployment required).
 */
public class TestKosmosFileSystem extends TestCase {

  KosmosFileSystem kosmosFileSystem; // file system under test
  KFSEmulationImpl kfsEmul;          // local-FS backed KFS stand-in
  Path baseDir;                      // scratch directory for each test

  @Override
  protected void setUp() throws IOException {
    Configuration conf = new Configuration();

    kfsEmul = new KFSEmulationImpl(conf);
    kosmosFileSystem = new KosmosFileSystem(kfsEmul);
    // a dummy URI; we are not connecting to any setup here
    kosmosFileSystem.initialize(URI.create("kfs:///"), conf);
    baseDir = new Path(System.getProperty("test.build.data", "/tmp") +
                       "/kfs-test");
  }

  @Override
  protected void tearDown() throws Exception {
    // intentionally empty: each test deletes what it creates
  }

  // @Test
  // Check all the directory API's in KFS
  public void testDirs() throws Exception {
    Path subDir1 = new Path("dir.1");

    // make the dir
    kosmosFileSystem.mkdirs(baseDir);
    assertTrue(kosmosFileSystem.isDirectory(baseDir));
    kosmosFileSystem.setWorkingDirectory(baseDir);

    kosmosFileSystem.mkdirs(subDir1);
    assertTrue(kosmosFileSystem.isDirectory(subDir1));

    // paths that were never created must not exist
    assertFalse(kosmosFileSystem.exists(new Path("test1")));
    assertFalse(kosmosFileSystem.isDirectory(new Path("test/dir.2")));

    // NOTE: assertEquals args are actual-first here (reversed vs JUnit
    // convention); harmless but failure messages read backwards
    FileStatus[] p = kosmosFileSystem.listStatus(baseDir);
    assertEquals(p.length, 1);

    kosmosFileSystem.delete(baseDir, true);
    assertFalse(kosmosFileSystem.exists(baseDir));
  }

  // @Test
  // Check the file API's
  public void testFiles() throws Exception {
    Path subDir1 = new Path("dir.1");
    Path file1 = new Path("dir.1/foo.1");
    Path file2 = new Path("dir.1/foo.2");

    kosmosFileSystem.mkdirs(baseDir);
    assertTrue(kosmosFileSystem.isDirectory(baseDir));
    kosmosFileSystem.setWorkingDirectory(baseDir);

    kosmosFileSystem.mkdirs(subDir1);

    // create two empty files in the subdirectory
    FSDataOutputStream s1 = kosmosFileSystem.create(file1, true, 4096, (short) 1, (long) 4096, null);
    FSDataOutputStream s2 = kosmosFileSystem.create(file2, true, 4096, (short) 1, (long) 4096, null);

    s1.close();
    s2.close();

    FileStatus[] p = kosmosFileSystem.listStatus(subDir1);
    assertEquals(p.length, 2);

    // deleting files one at a time shrinks the listing accordingly
    kosmosFileSystem.delete(file1, true);
    p = kosmosFileSystem.listStatus(subDir1);
    assertEquals(p.length, 1);

    kosmosFileSystem.delete(file2, true);
    p = kosmosFileSystem.listStatus(subDir1);
    assertEquals(p.length, 0);

    kosmosFileSystem.delete(baseDir, true);
    assertFalse(kosmosFileSystem.exists(baseDir));
  }

  // @Test
  // Check file/read write
  public void testFileIO() throws Exception {
    Path subDir1 = new Path("dir.1");
    Path file1 = new Path("dir.1/foo.1");

    kosmosFileSystem.mkdirs(baseDir);
    assertTrue(kosmosFileSystem.isDirectory(baseDir));
    kosmosFileSystem.setWorkingDirectory(baseDir);

    kosmosFileSystem.mkdirs(subDir1);

    FSDataOutputStream s1 = kosmosFileSystem.create(file1, true, 4096, (short) 1, (long) 4096, null);

    int bufsz = 4096;
    byte[] data = new byte[bufsz];

    for (int i = 0; i < data.length; i++)
      data[i] = (byte) (i % 16);

    // write 4 bytes and read them back; read API should return a byte per call
    s1.write(32);
    s1.write(32);
    s1.write(32);
    s1.write(32);
    // write some data
    s1.write(data, 0, data.length);
    // flush out the changes
    s1.close();

    // Read the stuff back and verify it is correct
    FSDataInputStream s2 = kosmosFileSystem.open(file1, 4096);
    int v;
    long nread = 0;

    v = s2.read();
    assertEquals(v, 32);
    v = s2.read();
    assertEquals(v, 32);
    v = s2.read();
    assertEquals(v, 32);
    v = s2.read();
    assertEquals(v, 32);

    // after the 4 single bytes, exactly the bulk payload should remain
    assertEquals(s2.available(), data.length);

    byte[] buf = new byte[bufsz];
    s2.read(buf, 0, buf.length);
    nread = s2.getPos();

    for (int i = 0; i < data.length; i++)
      assertEquals(data[i], buf[i]);

    assertEquals(s2.available(), 0);

    s2.close();

    // append some data to the file
    try {
      s1 = kosmosFileSystem.append(file1);
      // use a different fill pattern so appended bytes are distinguishable
      for (int i = 0; i < data.length; i++)
        data[i] = (byte) (i % 17);
      // write the data
      s1.write(data, 0, data.length);
      // flush out the changes
      s1.close();

      // read it back and validate
      s2 = kosmosFileSystem.open(file1, 4096);
      s2.seek(nread);
      s2.read(buf, 0, buf.length);
      for (int i = 0; i < data.length; i++)
        assertEquals(data[i], buf[i]);

      s2.close();
    } catch (Exception e) {
      // deliberate best-effort: append is optional on some local FS impls
      System.out.println("append isn't supported by the underlying fs");
    }

    kosmosFileSystem.delete(file1, true);
    assertFalse(kosmosFileSystem.exists(file1));
    kosmosFileSystem.delete(subDir1, true);
    assertFalse(kosmosFileSystem.exists(subDir1));
    kosmosFileSystem.delete(baseDir, true);
    assertFalse(kosmosFileSystem.exists(baseDir));
  }

}
diff --git a/src/test/org/apache/hadoop/fs/loadGenerator/DataGenerator.java b/src/test/org/apache/hadoop/fs/loadGenerator/DataGenerator.java
new file mode 100644
index 00000000000..4825bbada50
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/loadGenerator/DataGenerator.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.loadGenerator;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * This program reads the directory structure and file structure from
+ * the input directory and creates the namespace in the file system
+ * specified by the configuration in the specified root.
+ * All the files are filled with 'a'.
+ *
+ * The synopsis of the command is
+ * java DataGenerator
+ * -inDir : input directory name where directory/file structures
+ * are stored. Its default value is the current directory.
+ * -root : the name of the root directory which the new namespace
+ * is going to be placed under.
+ * Its default value is "/testLoadSpace".
+ */
+public class DataGenerator extends Configured implements Tool {
+ private File inDir = StructureGenerator.DEFAULT_STRUCTURE_DIRECTORY;
+ private Path root = DEFAULT_ROOT;
+ private FileSystem fs;
+ final static private long BLOCK_SIZE = 10;
+ final static private String USAGE = "java DataGenerator " +
+ "-inDir " +
+ "-root ";
+
+ /** default name of the root where the test namespace will be placed under */
+ final static Path DEFAULT_ROOT = new Path("/testLoadSpace");
+
+ /** Main function.
+ * It first parses the command line arguments.
+ * It then reads the directory structure from the input directory
+ * structure file and creates directory structure in the file system
+ * namespace. Afterwards it reads the file attributes and creates files
+ * in the file. All file content is filled with 'a'.
+ */
+ public int run(String[] args) throws Exception {
+ int exitCode = 0;
+ exitCode = init(args);
+ if (exitCode != 0) {
+ return exitCode;
+ }
+ genDirStructure();
+ genFiles();
+ return exitCode;
+ }
+
  /** Parse the command line arguments and initialize the data.
   * Recognized flags: -root (namespace root, default /testLoadSpace)
   * and -inDir (input structure directory, default current directory).
   * @return 0 on success, -1 if the file system cannot be initialized;
   *         note: exits the JVM on an unrecognized argument
   */
  private int init(String[] args) {
    try { // initialize file system handle
      fs = FileSystem.get(getConf());
    } catch (IOException ioe) {
      System.err.println("Can not initialize the file system: " +
          ioe.getLocalizedMessage());
      return -1;
    }

    for (int i = 0; i < args.length; i++) { // parse command line
      if (args[i].equals("-root")) {
        root = new Path(args[++i]);
      } else if (args[i].equals("-inDir")) {
        inDir = new File(args[++i]);
      } else {
        // unknown flag: print usage and abort the whole program
        System.err.println(USAGE);
        ToolRunner.printGenericCommandUsage(System.err);
        System.exit(-1);
      }
    }
    return 0;
  }
+
+ /** Read directory structure file under the input directory.
+ * Create each directory under the specified root.
+ * The directory names are relative to the specified root.
+ */
+ private void genDirStructure() throws IOException {
+ BufferedReader in = new BufferedReader(
+ new FileReader(new File(inDir,
+ StructureGenerator.DIR_STRUCTURE_FILE_NAME)));
+ String line;
+ while ((line=in.readLine()) != null) {
+ fs.mkdirs(new Path(root+line));
+ }
+ }
+
+ /** Read file structure file under the input directory.
+ * Create each file under the specified root.
+ * The file names are relative to the root.
+ */
+ private void genFiles() throws IOException {
+ BufferedReader in = new BufferedReader(
+ new FileReader(new File(inDir,
+ StructureGenerator.FILE_STRUCTURE_FILE_NAME)));
+ String line;
+ while ((line=in.readLine()) != null) {
+ String[] tokens = line.split(" ");
+ if (tokens.length != 2) {
+ throw new IOException("Expect at most 2 tokens per line: " + line);
+ }
+ String fileName = root+tokens[0];
+ long fileSize = (long)(BLOCK_SIZE*Double.parseDouble(tokens[1]));
+ genFile(new Path(fileName), fileSize);
+ }
+ }
+
+ /** Create a file with the name file and
+ * a length of fileSize. The file is filled with character 'a'.
+ */
+ private void genFile(Path file, long fileSize) throws IOException {
+ FSDataOutputStream out = fs.create(file, true,
+ getConf().getInt("io.file.buffer.size", 4096),
+ (short)getConf().getInt("dfs.replication", 3),
+ fs.getDefaultBlockSize());
+ for(long i=0; i: read probability [0, 1]
+ * with a default value of 0.3333.
+ * -writeProbability : write probability [0, 1]
+ * with a default value of 0.3333.
+ * -root : test space with a default value of /testLoadSpace
+ * -maxDelayBetweenOps :
+ * Max delay in the unit of milliseconds between two operations with a
+ * default value of 0 indicating no delay.
+ * -numOfThreads :
+ * number of threads to spawn with a default value of 200.
+ * -elapsedTime :
+ * the elapsed time of program with a default value of 0
+ * indicating running forever
+ * -startTime : when the threads start to run.
+ * -scriptFile : text file to parse for scripted operation
+ */
+public class LoadGenerator extends Configured implements Tool {
+ public static final Log LOG = LogFactory.getLog(LoadGenerator.class);
+
+ private volatile boolean shouldRun = true;
+ private Path root = DataGenerator.DEFAULT_ROOT;
+ private FileSystem fs;
+ private int maxDelayBetweenOps = 0;
+ private int numOfThreads = 200;
+ private long [] durations = {0};
+ private double [] readProbs = {0.3333};
+ private double [] writeProbs = {0.3333};
+ private volatile int currentIndex = 0;
+ long totalTime = 0;
+ private long startTime = System.currentTimeMillis()+10000;
+ final static private int BLOCK_SIZE = 10;
+ private ArrayList files = new ArrayList(); // a table of file names
+ private ArrayList dirs = new ArrayList(); // a table of directory names
+ private Random r = null;
+ final private static String USAGE = "java LoadGenerator\n" +
+ "-readProbability \n" +
+ "-writeProbability \n" +
+ "-root \n" +
+ "-maxDelayBetweenOps \n" +
+ "-numOfThreads \n" +
+ "-elapsedTime \n" +
+ "-startTime \n" +
+ "-scriptFile ";
+ final private String hostname;
+
  /** Constructor: caches the local host name, which is combined with the
   *  thread id to build unique per-thread file names during writes. */
  public LoadGenerator() throws IOException, UnknownHostException {
    InetAddress addr = InetAddress.getLocalHost();
    hostname = addr.getHostName();
  }
+
+ private final static int OPEN = 0;
+ private final static int LIST = 1;
+ private final static int CREATE = 2;
+ private final static int WRITE_CLOSE = 3;
+ private final static int DELETE = 4;
+ private final static int TOTAL_OP_TYPES =5;
+ private long [] executionTime = new long[TOTAL_OP_TYPES];
+ private long [] totalNumOfOps = new long[TOTAL_OP_TYPES];
+
+ /** A thread sends a stream of requests to the NameNode.
+ * At each iteration, it first decides if it is going to read a file,
+ * create a file, or listing a directory following the read
+ * and write probabilities.
+ * When reading, it randomly picks a file in the test space and reads
+ * the entire file. When writing, it randomly picks a directory in the
+ * test space and creates a file whose name consists of the current
+ * machine's host name and the thread id. The length of the file
+ * follows Gaussian distribution with an average size of 2 blocks and
+ * the standard deviation of 1 block. The new file is filled with 'a'.
+ * Immediately after the file creation completes, the file is deleted
+ * from the test space.
+ * While listing, it randomly picks a directory in the test space and
+ * list the directory content.
+ * Between two consecutive operations, the thread pauses for a random
+ * amount of time in the range of [0, maxDelayBetweenOps]
+ * if the specified max delay is not zero.
+ * A thread runs for the specified elapsed time if the time isn't zero.
+ * Otherwise, it runs forever.
+ */
  private class DFSClientThread extends Thread {
    private int id; // distinguishes files created by different threads
    // per-thread counters; merged into the generator's totals after join()
    private long [] executionTime = new long[TOTAL_OP_TYPES];
    private long [] totalNumOfOps = new long[TOTAL_OP_TYPES];
    private byte[] buffer = new byte[1024]; // scratch buffer for reads

    private DFSClientThread(int id) {
      this.id = id;
    }

    /** Main loop
     * Each iteration decides what's the next operation and then pauses.
     */
    public void run() {
      try {
        while (shouldRun) {
          nextOp();
          delay();
        }
      } catch (Exception ioe) {
        // any failure ends this worker but not the whole run
        System.err.println(ioe.getLocalizedMessage());
        ioe.printStackTrace();
      }
    }

    /** Let the thread pause for a random amount of time in the range of
     * [0, maxDelayBetweenOps] if the delay is not zero. Otherwise, no pause.
     */
    private void delay() throws InterruptedException {
      if (maxDelayBetweenOps>0) {
        int delay = r.nextInt(maxDelayBetweenOps);
        Thread.sleep(delay);
      }
    }

    /** Perform the next operation.
     *
     * Depending on the read and write probabilities, the next
     * operation could be either read, write, or list.
     */
    private void nextOp() throws IOException {
      double rn = r.nextDouble();
      // snapshot the volatile index once so both probabilities come from
      // the same script line
      int i = currentIndex;

      if(LOG.isDebugEnabled())
        LOG.debug("Thread " + this.id + " moving to index " + i);

      if (rn < readProbs[i]) {
        read();
      } else if (rn < readProbs[i] + writeProbs[i]) {
        write();
      } else {
        list();
      }
    }

    /** Read operation randomly picks a file in the test space and reads
     * the entire file */
    private void read() throws IOException {
      String fileName = files.get(r.nextInt(files.size()));
      long startTime = System.currentTimeMillis();
      InputStream in = fs.open(new Path(fileName));
      // only the open() call is timed; the data transfer below is not
      executionTime[OPEN] += (System.currentTimeMillis()-startTime);
      totalNumOfOps[OPEN]++;
      while (in.read(buffer) != -1) {}
      in.close();
    }

    /** The write operation randomly picks a directory in the
     * test space and creates a file whose name consists of the current
     * machine's host name and the thread id. The length of the file
     * follows Gaussian distribution with an average size of 2 blocks and
     * the standard deviation of 1 block. The new file is filled with 'a'.
     * Immediately after the file creation completes, the file is deleted
     * from the test space.
     */
    private void write() throws IOException {
      String dirName = dirs.get(r.nextInt(dirs.size()));
      Path file = new Path(dirName, hostname+id);
      double fileSize = 0;
      // resample until a positive size is drawn (mean 2, stddev 1 blocks)
      while ((fileSize = r.nextGaussian()+2)<=0) {}
      genFile(file, (long)(fileSize*BLOCK_SIZE));
      long startTime = System.currentTimeMillis();
      fs.delete(file, true);
      executionTime[DELETE] += (System.currentTimeMillis()-startTime);
      totalNumOfOps[DELETE]++;
    }

    /** The list operation randomly picks a directory in the test space and
     * list the directory content.
     */
    private void list() throws IOException {
      String dirName = dirs.get(r.nextInt(dirs.size()));
      long startTime = System.currentTimeMillis();
      fs.listStatus(new Path(dirName));
      executionTime[LIST] += (System.currentTimeMillis()-startTime);
      totalNumOfOps[LIST]++;
    }
  }
+
+ /** Main function:
+ * It first initializes data by parsing the command line arguments.
+ * It then starts the number of DFSClient threads as specified by
+ * the user.
+ * It stops all the threads when the specified elapsed time is passed.
+ * Before exiting, it prints the average execution for
+ * each operation and operation throughput.
+ */
+ public int run(String[] args) throws Exception {
+ int exitCode = init(args);
+ if (exitCode != 0) {
+ return exitCode;
+ }
+
+ barrier();
+
+ DFSClientThread[] threads = new DFSClientThread[numOfThreads];
+ for (int i=0; i 0) {
+ while(shouldRun) {
+ Thread.sleep(durations[currentIndex] * 1000);
+ totalTime += durations[currentIndex];
+
+ // Are we on the final line of the script?
+ if( (currentIndex + 1) == durations.length) {
+ shouldRun = false;
+ } else {
+ if(LOG.isDebugEnabled()) {
+ LOG.debug("Moving to index " + currentIndex + ": r = "
+ + readProbs[currentIndex] + ", w = " + writeProbs
+ + " for duration " + durations[currentIndex]);
+ }
+ currentIndex++;
+ }
+ }
+ }
+
+ LOG.debug("Done with testing. Waiting for threads to finish.");
+ for (DFSClientThread thread : threads) {
+ thread.join();
+ for (int i=0; i 1) {
+ System.err.println(
+ "The read probability must be [0, 1]: " + readProbs[0]);
+ return -1;
+ }
+ } else if (args[i].equals("-writeProbability")) {
+ if(scriptSpecified) {
+ System.err.println("Can't specify probabilities and use script.");
+ return -1;
+ }
+ writeProbs[0] = Double.parseDouble(args[++i]);
+ if (writeProbs[0] < 0 || writeProbs[0] > 1) {
+ System.err.println(
+ "The write probability must be [0, 1]: " + writeProbs[0]);
+ return -1;
+ }
+ } else if (args[i].equals("-root")) {
+ root = new Path(args[++i]);
+ } else if (args[i].equals("-maxDelayBetweenOps")) {
+ maxDelayBetweenOps = Integer.parseInt(args[++i]); // in milliseconds
+ } else if (args[i].equals("-numOfThreads")) {
+ numOfThreads = Integer.parseInt(args[++i]);
+ if (numOfThreads <= 0) {
+ System.err.println(
+ "Number of threads must be positive: " + numOfThreads);
+ return -1;
+ }
+ } else if (args[i].equals("-startTime")) {
+ startTime = Long.parseLong(args[++i]);
+ } else if (args[i].equals("-elapsedTime")) {
+ if(scriptSpecified) {
+ System.err.println("Can't specify elapsedTime and use script.");
+ return -1;
+ }
+ durations[0] = Long.parseLong(args[++i]);
+ } else if (args[i].equals("-seed")) {
+ r = new Random(Long.parseLong(args[++i])+hostHashCode);
+ } else {
+ System.err.println(USAGE);
+ ToolRunner.printGenericCommandUsage(System.err);
+ return -1;
+ }
+ }
+ } catch (NumberFormatException e) {
+ System.err.println("Illegal parameter: " + e.getLocalizedMessage());
+ System.err.println(USAGE);
+ return -1;
+ }
+
+ for(int i = 0; i < readProbs.length; i++) {
+ if (readProbs[i] + writeProbs[i] <0 || readProbs[i]+ writeProbs[i] > 1) {
+ System.err.println(
+ "The sum of read probability and write probability must be [0, 1]: "
+ + readProbs[i] + " " + writeProbs[i]);
+ return -1;
+ }
+ }
+
+ if (r==null) {
+ r = new Random(System.currentTimeMillis()+hostHashCode);
+ }
+
+ return initFileDirTables();
+ }
+
+ /**
+ * Read a script file of the form: lines of text with duration in seconds,
+ * read probability and write probability, separated by white space.
+ *
+ * @param filename Script file
+ * @return 0 if successful, -1 if not
+ * @throws IOException if errors with file IO
+ */
+ private int loadScriptFile(String filename) throws IOException {
+ FileReader fr = new FileReader(new File(filename));
+ BufferedReader br = new BufferedReader(fr);
+ ArrayList duration = new ArrayList();
+ ArrayList readProb = new ArrayList();
+ ArrayList writeProb = new ArrayList();
+ int lineNum = 0;
+
+ String line;
+ // Read script, parse values, build array of duration, read and write probs
+ while((line = br.readLine()) != null) {
+ lineNum++;
+ if(line.startsWith("#") || line.isEmpty()) // skip comments and blanks
+ continue;
+
+ String[] a = line.split("\\s");
+ if(a.length != 3) {
+ System.err.println("Line " + lineNum +
+ ": Incorrect number of parameters: " + line);
+ }
+
+ try {
+ long d = Long.parseLong(a[0]);
+ if(d < 0) {
+ System.err.println("Line " + lineNum + ": Invalid duration: " + d);
+ return -1;
+ }
+
+ double r = Double.parseDouble(a[1]);
+ if(r < 0.0 || r > 1.0 ) {
+ System.err.println("Line " + lineNum +
+ ": The read probability must be [0, 1]: " + r);
+ return -1;
+ }
+
+ double w = Double.parseDouble(a[2]);
+ if(w < 0.0 || w > 1.0) {
+ System.err.println("Line " + lineNum +
+ ": The read probability must be [0, 1]: " + r);
+ return -1;
+ }
+
+ readProb.add(r);
+ duration.add(d);
+ writeProb.add(w);
+ } catch( NumberFormatException nfe) {
+ System.err.println(lineNum + ": Can't parse: " + line);
+ return -1;
+ }
+ }
+
+ br.close();
+ fr.close();
+
+ // Copy vectors to arrays of values, to avoid autoboxing overhead later
+ durations = new long[duration.size()];
+ readProbs = new double[readProb.size()];
+ writeProbs = new double[writeProb.size()];
+
+ for(int i = 0; i < durations.length; i++) {
+ durations[i] = duration.get(i);
+ readProbs[i] = readProb.get(i);
+ writeProbs[i] = writeProb.get(i);
+ }
+
+ if(durations[0] == 0)
+ System.err.println("Initial duration set to 0. " +
+ "Will loop until stopped manually.");
+
+ return 0;
+ }
+
+ /** Create a table that contains all directories under root and
+ * another table that contains all files under root.
+ */
+ private int initFileDirTables() {
+ try {
+ initFileDirTables(root);
+ } catch (IOException e) {
+ System.err.println(e.getLocalizedMessage());
+ e.printStackTrace();
+ return -1;
+ }
+ if (dirs.isEmpty()) {
+ System.err.println("The test space " + root + " is empty");
+ return -1;
+ }
+ if (files.isEmpty()) {
+ System.err.println("The test space " + root +
+ " does not have any file");
+ return -1;
+ }
+ return 0;
+ }
+
+ /** Create a table that contains all directories under the specified path and
+ * another table that contains all files under the specified path and
+ * whose name starts with "_file_".
+ */
+ // Recursively lists 'path': every directory is added to 'dirs'; a file is
+ // added to 'files' only when its name carries StructureGenerator's
+ // "_file_" prefix, so files not created by the generator are ignored.
+ private void initFileDirTables(Path path) throws IOException {
+ FileStatus[] stats = fs.listStatus(path);
+ if (stats != null) {
+ for (FileStatus stat : stats) {
+ if (stat.isDir()) {
+ dirs.add(stat.getPath().toString());
+ initFileDirTables(stat.getPath());
+ } else {
+ Path filePath = stat.getPath();
+ if (filePath.getName().startsWith(StructureGenerator.FILE_NAME_PREFIX)) {
+ files.add(filePath.toString());
+ }
+ }
+ }
+ }
+ }
+
+ /** Returns when the current number of seconds from the epoch equals
+ * the command line argument given by -startTime.
+ * This allows multiple instances of this program, running on clock
+ * synchronized nodes, to start at roughly the same time.
+ */
+ // Sleeps until the wall clock reaches 'startTime' so that multiple
+ // clock-synchronized instances begin work at roughly the same moment.
+ // An interrupt does not abort the wait: the loop re-checks the clock
+ // and sleeps again for the remainder.
+ // NOTE(review): the interrupt status is swallowed here rather than
+ // restored via Thread.currentThread().interrupt() — confirm intended.
+ private void barrier() {
+ long sleepTime;
+ while ((sleepTime = startTime - System.currentTimeMillis()) > 0) {
+ try {
+ Thread.sleep(sleepTime);
+ } catch (InterruptedException ex) {
+ }
+ }
+ }
+
+ /** Create a file with a length of fileSize.
+ * The file is filled with 'a'.
+ */
+ private void genFile(Path file, long fileSize) throws IOException {
+ long startTime = System.currentTimeMillis();
+ FSDataOutputStream out = fs.create(file, true,
+ getConf().getInt("io.file.buffer.size", 4096),
+ (short)getConf().getInt("dfs.replication", 3),
+ fs.getDefaultBlockSize());
+ executionTime[CREATE] += (System.currentTimeMillis()-startTime);
+ totalNumOfOps[CREATE]++;
+
+ for (long i=0; i : maximum depth of the directory tree; default is 5.
+ -minWidth : minimum number of subdirectories per directories; default is 1
+ -maxWidth : maximum number of subdirectories per directories; default is 5
+ -numOfFiles <#OfFiles> : the total number of files; default is 10.
+ -avgFileSize : average size of blocks; default is 1.
+ -outDir : output directory; default is the current directory.
+ -seed : random number generator seed; default is the current time.
+ */
+public class StructureGenerator {
+ private int maxDepth = 5;
+ private int minWidth = 1;
+ private int maxWidth = 5;
+ private int numOfFiles = 10;
+ private double avgFileSize = 1;
+ private File outDir = DEFAULT_STRUCTURE_DIRECTORY;
+ final static private String USAGE = "java StructureGenerator\n" +
+ "-maxDepth \n" +
+ "-minWidth \n" +
+ "-maxWidth \n" +
+ "-numOfFiles <#OfFiles>\n" +
+ "-avgFileSize \n" +
+ "-outDir \n" +
+ "-seed ";
+
+ private Random r = null;
+
+ /** Default directory for storing file/directory structure */
+ final static File DEFAULT_STRUCTURE_DIRECTORY = new File(".");
+ /** The name of the file for storing directory structure */
+ final static String DIR_STRUCTURE_FILE_NAME = "dirStructure";
+ /** The name of the file for storing file structure */
+ final static String FILE_STRUCTURE_FILE_NAME = "fileStructure";
+ /** The name prefix for the files created by this program */
+ final static String FILE_NAME_PREFIX = "_file_";
+
+ /**
+ * The main function first parses the command line arguments,
+ * then generates in-memory directory structure and outputs to a file,
+ * last generates in-memory files and outputs them to a file.
+ */
+ // Pipeline: parse args, build the in-memory directory tree and write it
+ // to DIR_STRUCTURE_FILE_NAME, then place files and write them to
+ // FILE_STRUCTURE_FILE_NAME.  Returns the non-zero init() code on bad args.
+ public int run(String[] args) throws Exception {
+ int exitCode = 0;
+ exitCode = init(args);
+ if (exitCode != 0) {
+ return exitCode;
+ }
+ genDirStructure();
+ output(new File(outDir, DIR_STRUCTURE_FILE_NAME));
+ genFileStructure();
+ outputFiles(new File(outDir, FILE_STRUCTURE_FILE_NAME));
+ return exitCode;
+ }
+
+ /** Parse the command line arguments and initialize the data */
+ /** Parse the command line arguments and initialize the data.
+ * @return 0 on success; -1 on an illegal or unparsable argument,
+ * with a diagnostic printed to stderr
+ */
+ private int init(String[] args) {
+ try {
+ for (int i = 0; i < args.length; i++) { // parse command line
+ if (args[i].equals("-maxDepth")) {
+ maxDepth = Integer.parseInt(args[++i]);
+ if (maxDepth<1) {
+ System.err.println("maxDepth must be positive: " + maxDepth);
+ return -1;
+ }
+ } else if (args[i].equals("-minWidth")) {
+ minWidth = Integer.parseInt(args[++i]);
+ if (minWidth<0) {
+ // Zero is accepted, so the message says non-negative, not positive.
+ System.err.println("minWidth must be non-negative: " + minWidth);
+ return -1;
+ }
+ } else if (args[i].equals("-maxWidth")) {
+ maxWidth = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-numOfFiles")) {
+ numOfFiles = Integer.parseInt(args[++i]);
+ if (numOfFiles<1) {
+ System.err.println("NumOfFiles must be positive: " + numOfFiles);
+ return -1;
+ }
+ } else if (args[i].equals("-avgFileSize")) {
+ avgFileSize = Double.parseDouble(args[++i]);
+ if (avgFileSize<=0) {
+ System.err.println("AvgFileSize must be positive: " + avgFileSize);
+ return -1;
+ }
+ } else if (args[i].equals("-outDir")) {
+ outDir = new File(args[++i]);
+ } else if (args[i].equals("-seed")) {
+ r = new Random(Long.parseLong(args[++i]));
+ } else {
+ System.err.println(USAGE);
+ ToolRunner.printGenericCommandUsage(System.err);
+ return -1;
+ }
+ }
+ } catch (NumberFormatException e) {
+ System.err.println("Illegal parameter: " + e.getLocalizedMessage());
+ System.err.println(USAGE);
+ return -1;
+ }
+
+ // Equal widths are allowed; only maxWidth < minWidth is rejected, so the
+ // message states the real constraint and reports both offending values.
+ if (maxWidth < minWidth) {
+ System.err.println(
+ "maxWidth must be no smaller than minWidth: maxWidth=" + maxWidth +
+ ", minWidth=" + minWidth);
+ return -1;
+ }
+
+ if (r==null) {
+ r = new Random(); // no -seed given: seed from the current time
+ }
+ return 0;
+ }
+
+ /** In memory representation of a directory */
+ private static class INode {
+ private String name;
+ // NOTE(review): raw List/ArrayList — the generic parameter (presumably
+ // INode) appears stripped from this patch text; confirm against upstream.
+ private List children = new ArrayList();
+
+ /** Constructor */
+ private INode(String name) {
+ this.name = name;
+ }
+
+ /** Add a child (subdir/file) */
+ private void addChild(INode child) {
+ children.add(child);
+ }
+
+ /** Output the subtree rooted at the current node.
+ * Only the leaves are printed.
+ */
+ private void output(PrintStream out, String prefix) {
+ // prefix==null marks the root call; otherwise the path accumulates
+ // as prefix/name down the recursion.
+ prefix = prefix==null?name:prefix+"/"+name;
+ if (children.isEmpty()) {
+ out.println(prefix);
+ } else {
+ for (INode child : children) {
+ child.output(out, prefix);
+ }
+ }
+ }
+
+ /** Output the files in the subtree rooted at this node */
+ protected void outputFiles(PrintStream out, String prefix) {
+ prefix = prefix==null?name:prefix+"/"+name;
+ // Directories print nothing themselves; FileINode overrides this to
+ // emit a line per file.
+ for (INode child : children) {
+ child.outputFiles(out, prefix);
+ }
+ }
+
+ /** Add all the leaves in the subtree to the input list */
+ private void getLeaves(List leaves) {
+ if (children.isEmpty()) {
+ leaves.add(this);
+ } else {
+ for (INode child : children) {
+ child.getLeaves(leaves);
+ }
+ }
+ }
+ }
+
+ /** In memory representation of a file */
+ private static class FileINode extends INode {
+ // Number of blocks may be fractional (sizes are drawn from a Gaussian
+ // around avgFileSize), hence double rather than int.
+ private double numOfBlocks;
+
+ /** constructor */
+ private FileINode(String name, double numOfBlocks) {
+ super(name);
+ this.numOfBlocks = numOfBlocks;
+ }
+
+ /** Output a file attribute */
+ protected void outputFiles(PrintStream out, String prefix) {
+ // Emits one line per file: "<path> <numOfBlocks>".
+ prefix = (prefix == null)?super.name: prefix + "/"+super.name;
+ out.println(prefix + " " + numOfBlocks);
+ }
+ }
+
+ private INode root;
+
+ /** Generates a directory tree with a max depth of maxDepth */
+ // Builds the whole tree under a root with an empty name, bounded by maxDepth.
+ private void genDirStructure() {
+ root = genDirStructure("", maxDepth);
+ }
+
+ /** Generate a directory tree rooted at rootName
+ * The number of subtree is in the range of [minWidth, maxWidth].
+ * The maximum depth of each subtree is in the range of
+ * [2*maxDepth/3, maxDepth].
+ */
+ private INode genDirStructure(String rootName, int maxDepth) {
+ INode root = new INode(rootName);
+
+ if (maxDepth>0) {
+ maxDepth--;
+ int minDepth = maxDepth*2/3;
+ // Figure out the number of subdirectories to generate
+ int numOfSubDirs = minWidth + r.nextInt(maxWidth-minWidth+1);
+ // Expand the tree
+ for (int i=0; i getLeaves() {
+ List leaveDirs = new ArrayList();
+ root.getLeaves(leaveDirs);
+ return leaveDirs;
+ }
+
+ /** Decides where to place all the files and its length.
+ * It first collects all empty directories in the tree.
+ * For each file, it randomly chooses an empty directory to place the file.
+ * The file's length is generated using Gaussian distribution.
+ */
+ private void genFileStructure() {
+ List leaves = getLeaves();
+ int totalLeaves = leaves.size();
+ for (int i=0; i inodes = new TreeMap();
+ private Map blocks = new HashMap();
+
+ // Only the Configuration is retained (for fs.s3.buffer.dir); the URI is
+ // ignored by this in-memory stub.
+ public void initialize(URI uri, Configuration conf) {
+ this.conf = conf;
+ }
+
+ // Fixed store-format version for the stub.
+ public String getVersion() throws IOException {
+ return "0";
+ }
+
+ // Removes the inode keyed by the normalized (absolute, scheme-less) path.
+ public void deleteINode(Path path) throws IOException {
+ inodes.remove(normalize(path));
+ }
+
+ // Removes the block's bytes, keyed by block id.
+ public void deleteBlock(Block block) throws IOException {
+ blocks.remove(block.getId());
+ }
+
+ // True iff an inode is stored for the normalized path.
+ public boolean inodeExists(Path path) throws IOException {
+ return inodes.containsKey(normalize(path));
+ }
+
+ // True iff bytes are stored for the given block id.
+ public boolean blockExists(long blockId) throws IOException {
+ return blocks.containsKey(blockId);
+ }
+
+ // Returns the inode for the normalized path, or null if absent.
+ public INode retrieveINode(Path path) throws IOException {
+ return inodes.get(normalize(path));
+ }
+
+ // Copies the block's bytes from 'byteRangeStart' to the end into a fresh
+ // temp file and returns that file.
+ // NOTE(review): an unknown block id makes 'data' null and this throws
+ // NPE — presumably callers check blockExists first; confirm.
+ public File retrieveBlock(Block block, long byteRangeStart) throws IOException {
+ byte[] data = blocks.get(block.getId());
+ File file = createTempFile();
+ BufferedOutputStream out = null;
+ try {
+ out = new BufferedOutputStream(new FileOutputStream(file));
+ out.write(data, (int) byteRangeStart, data.length - (int) byteRangeStart);
+ } finally {
+ if (out != null) {
+ out.close();
+ }
+ }
+ return file;
+ }
+
+ // Creates a delete-on-exit temp file under fs.s3.buffer.dir, creating the
+ // directory itself on first use.
+ private File createTempFile() throws IOException {
+ File dir = new File(conf.get("fs.s3.buffer.dir"));
+ if (!dir.exists() && !dir.mkdirs()) {
+ throw new IOException("Cannot create S3 buffer directory: " + dir);
+ }
+ File result = File.createTempFile("test-", ".tmp", dir);
+ result.deleteOnExit();
+ return result;
+ }
+
+ // Immediate children only: scans the sorted key tail starting at the
+ // normalized path and keeps entries whose parent is exactly that path.
+ // NOTE(review): raw Set/LinkedHashSet — generic parameter (presumably
+ // Path) appears stripped from this patch text; confirm upstream.
+ public Set listSubPaths(Path path) throws IOException {
+ Path normalizedPath = normalize(path);
+ // This is inefficient but more than adequate for testing purposes.
+ Set subPaths = new LinkedHashSet();
+ for (Path p : inodes.tailMap(normalizedPath).keySet()) {
+ if (normalizedPath.equals(p.getParent())) {
+ subPaths.add(p);
+ }
+ }
+ return subPaths;
+ }
+
+ // All descendants: matches every stored path whose string form begins
+ // with the normalized path followed by "/".
+ public Set listDeepSubPaths(Path path) throws IOException {
+ Path normalizedPath = normalize(path);
+ String pathString = normalizedPath.toUri().getPath();
+ // Append the trailing slash so "/foo" does not also match "/foobar".
+ if (!pathString.endsWith("/")) {
+ pathString += "/";
+ }
+ // This is inefficient but more than adequate for testing purposes.
+ Set subPaths = new LinkedHashSet();
+ for (Path p : inodes.tailMap(normalizedPath).keySet()) {
+ if (p.toUri().getPath().startsWith(pathString)) {
+ subPaths.add(p);
+ }
+ }
+ return subPaths;
+ }
+
+ // Stores (or overwrites) the inode under the normalized path.
+ public void storeINode(Path path, INode inode) throws IOException {
+ inodes.put(normalize(path), inode);
+ }
+
+ // Reads the whole file into memory and stores the bytes keyed by block id.
+ public void storeBlock(Block block, File file) throws IOException {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ byte[] buf = new byte[8192];
+ int numRead;
+ BufferedInputStream in = null;
+ try {
+ in = new BufferedInputStream(new FileInputStream(file));
+ while ((numRead = in.read(buf)) >= 0) {
+ out.write(buf, 0, numRead);
+ }
+ } finally {
+ if (in != null) {
+ in.close();
+ }
+ }
+ blocks.put(block.getId(), out.toByteArray());
+ }
+
+ // Canonical map key: drops the URI scheme/authority, keeping only the
+ // path component.  Relative paths are a caller error.
+ private Path normalize(Path path) {
+ if (!path.isAbsolute()) {
+ throw new IllegalArgumentException("Path must be absolute: " + path);
+ }
+ return new Path(path.toUri().getPath());
+ }
+
+ // Drops all stored inodes and blocks (used by test teardown).
+ public void purge() throws IOException {
+ inodes.clear();
+ blocks.clear();
+ }
+
+ // Debug aid: prints every inode with its type and, for files, its blocks,
+ // then the raw key sets of both maps.
+ // NOTE(review): raw Map.Entry — the type arguments (presumably
+ // Path, INode) appear stripped from this patch text; confirm upstream.
+ public void dump() throws IOException {
+ StringBuilder sb = new StringBuilder(getClass().getSimpleName());
+ sb.append(", \n");
+ for (Map.Entry entry : inodes.entrySet()) {
+ sb.append(entry.getKey()).append("\n");
+ INode inode = entry.getValue();
+ sb.append("\t").append(inode.getFileType()).append("\n");
+ // Directories carry no blocks; skip the block listing for them.
+ if (inode.getFileType() == FileType.DIRECTORY) {
+ continue;
+ }
+ for (int j = 0; j < inode.getBlocks().length; j++) {
+ sb.append("\t").append(inode.getBlocks()[j]).append("\n");
+ }
+ }
+ System.out.println(sb);
+
+ System.out.println(inodes.keySet());
+ System.out.println(blocks.keySet());
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/s3/Jets3tS3FileSystemContractTest.java b/src/test/org/apache/hadoop/fs/s3/Jets3tS3FileSystemContractTest.java
new file mode 100644
index 00000000000..53b3c03c414
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3/Jets3tS3FileSystemContractTest.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3;
+
+import java.io.IOException;
+
+// Runs the S3 FileSystem contract suite against the real Jets3t-backed
+// store (i.e. against live S3, configured elsewhere) by supplying
+// Jets3tFileSystemStore to the shared base test.
+public class Jets3tS3FileSystemContractTest
+ extends S3FileSystemContractBaseTest {
+
+ @Override
+ FileSystemStore getFileSystemStore() throws IOException {
+ return new Jets3tFileSystemStore();
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java b/src/test/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java
new file mode 100644
index 00000000000..8d6744a12a3
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3/S3FileSystemContractBaseTest.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3;
+
+import java.io.IOException;
+import java.net.URI;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystemContractBaseTest;
+
+// Template for S3 contract tests: a subclass supplies the FileSystemStore,
+// setUp wires an S3FileSystem over it using the test.fs.s3.name URI from
+// the Configuration, and tearDown purges the store after each test.
+public abstract class S3FileSystemContractBaseTest
+ extends FileSystemContractBaseTest {
+
+ private FileSystemStore store;
+
+ /** Hook for subclasses to choose the backing store implementation. */
+ abstract FileSystemStore getFileSystemStore() throws IOException;
+
+ @Override
+ protected void setUp() throws Exception {
+ Configuration conf = new Configuration();
+ store = getFileSystemStore();
+ fs = new S3FileSystem(store);
+ fs.initialize(URI.create(conf.get("test.fs.s3.name")), conf);
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ store.purge();
+ super.tearDown();
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/s3/TestINode.java b/src/test/org/apache/hadoop/fs/s3/TestINode.java
new file mode 100644
index 00000000000..086a43eabca
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3/TestINode.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.fs.s3.INode.FileType;
+
+// Round-trip tests for INode serialization: file with one block, the shared
+// directory singleton, and the null-input case.
+public class TestINode extends TestCase {
+
+ public void testSerializeFileWithSingleBlock() throws IOException {
+ Block[] blocks = { new Block(849282477840258181L, 128L) };
+ INode inode = new INode(FileType.FILE, blocks);
+
+ // Expected length: 1 file-type byte + 4-byte block count + 16 bytes
+ // (two longs: id and length) for the single block.
+ assertEquals("Length", 1L + 4 + 16, inode.getSerializedLength());
+ InputStream in = inode.serialize();
+
+ INode deserialized = INode.deserialize(in);
+
+ assertEquals("FileType", inode.getFileType(), deserialized.getFileType());
+ Block[] deserializedBlocks = deserialized.getBlocks();
+ assertEquals("Length", 1, deserializedBlocks.length);
+ assertEquals("Id", blocks[0].getId(), deserializedBlocks[0].getId());
+ assertEquals("Length", blocks[0].getLength(), deserializedBlocks[0]
+ .getLength());
+
+ }
+
+ public void testSerializeDirectory() throws IOException {
+ // Directories serialize to just the 1-byte file type and deserialize
+ // back to the shared singleton instance.
+ INode inode = INode.DIRECTORY_INODE;
+ assertEquals("Length", 1L, inode.getSerializedLength());
+ InputStream in = inode.serialize();
+ INode deserialized = INode.deserialize(in);
+ assertSame(INode.DIRECTORY_INODE, deserialized);
+ }
+
+ public void testDeserializeNull() throws IOException {
+ // A null stream maps to "no inode" rather than an exception.
+ assertNull(INode.deserialize(null));
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/s3/TestInMemoryS3FileSystemContract.java b/src/test/org/apache/hadoop/fs/s3/TestInMemoryS3FileSystemContract.java
new file mode 100644
index 00000000000..5d66cf12c85
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3/TestInMemoryS3FileSystemContract.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3;
+
+import java.io.IOException;
+
+// Runs the S3 FileSystem contract suite against the in-memory stub store,
+// so no S3 connectivity is needed.
+public class TestInMemoryS3FileSystemContract
+ extends S3FileSystemContractBaseTest {
+
+ @Override
+ FileSystemStore getFileSystemStore() throws IOException {
+ return new InMemoryFileSystemStore();
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/s3/TestS3Credentials.java b/src/test/org/apache/hadoop/fs/s3/TestS3Credentials.java
new file mode 100644
index 00000000000..bcbf0dc607a
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3/TestS3Credentials.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs.s3;
+
+import java.net.URI;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+
+// Verifies that S3Credentials rejects URIs whose host contains underscores
+// (an invalid hostname) with a descriptive IllegalArgumentException.
+public class TestS3Credentials extends TestCase {
+ public void testInvalidHostnameWithUnderscores() throws Exception {
+ S3Credentials s3Credentials = new S3Credentials();
+ try {
+ s3Credentials.initialize(new URI("s3://a:b@c_d"), new Configuration());
+ fail("Should throw IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ assertEquals("Invalid hostname in URI s3://a:b@c_d", e.getMessage());
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/s3/TestS3FileSystem.java b/src/test/org/apache/hadoop/fs/s3/TestS3FileSystem.java
new file mode 100644
index 00000000000..f21989c5d97
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3/TestS3FileSystem.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3;
+
+import java.io.IOException;
+import java.net.URI;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+
+// Checks that S3FileSystem.initialize normalizes its URI to just
+// scheme + authority: trailing slashes and path components are dropped,
+// while userinfo (credentials) is preserved.
+public class TestS3FileSystem extends TestCase {
+
+ public void testInitialization() throws IOException {
+ initializationTest("s3://a:b@c", "s3://a:b@c");
+ initializationTest("s3://a:b@c/", "s3://a:b@c");
+ initializationTest("s3://a:b@c/path", "s3://a:b@c");
+ initializationTest("s3://a@c", "s3://a@c");
+ initializationTest("s3://a@c/", "s3://a@c");
+ initializationTest("s3://a@c/path", "s3://a@c");
+ initializationTest("s3://c", "s3://c");
+ initializationTest("s3://c/", "s3://c");
+ initializationTest("s3://c/path", "s3://c");
+ }
+
+ // Initializes a fresh in-memory-backed S3FileSystem with the first URI
+ // and asserts getUri() reports the second.
+ private void initializationTest(String initializationUri, String expectedUri)
+ throws IOException {
+
+ S3FileSystem fs = new S3FileSystem(new InMemoryFileSystemStore());
+ fs.initialize(URI.create(initializationUri), new Configuration());
+ assertEquals(URI.create(expectedUri), fs.getUri());
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/s3native/InMemoryNativeFileSystemStore.java b/src/test/org/apache/hadoop/fs/s3native/InMemoryNativeFileSystemStore.java
new file mode 100644
index 00000000000..d3086da9e82
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3native/InMemoryNativeFileSystemStore.java
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3native;
+
+import static org.apache.hadoop.fs.s3native.NativeS3FileSystem.PATH_DELIMITER;
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ *
+ * A stub implementation of {@link NativeFileSystemStore} for testing
+ * {@link NativeS3FileSystem} without actually connecting to S3.
+ *
+ */
+class InMemoryNativeFileSystemStore implements NativeFileSystemStore {
+
+ private Configuration conf;
+
+ // Sorted maps so list() iterates keys in lexicographic order, mirroring
+ // S3's listing semantics.  (Generic parameters restored here — they were
+ // garbled in the patch text, e.g. "Iterator>" in purge.)
+ private SortedMap<String, FileMetadata> metadataMap =
+ new TreeMap<String, FileMetadata>();
+ private SortedMap<String, byte[]> dataMap = new TreeMap<String, byte[]>();
+
+ // Only the Configuration is kept (for fs.s3.buffer.dir); the URI is ignored.
+ public void initialize(URI uri, Configuration conf) throws IOException {
+ this.conf = conf;
+ }
+
+ public void storeEmptyFile(String key) throws IOException {
+ metadataMap.put(key, new FileMetadata(key, 0, System.currentTimeMillis()));
+ dataMap.put(key, new byte[0]);
+ }
+
+ // Reads the whole file into memory and records it under 'key'.  The
+ // md5Hash argument is ignored by this stub.
+ public void storeFile(String key, File file, byte[] md5Hash)
+ throws IOException {
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ byte[] buf = new byte[8192];
+ int numRead;
+ BufferedInputStream in = null;
+ try {
+ in = new BufferedInputStream(new FileInputStream(file));
+ while ((numRead = in.read(buf)) >= 0) {
+ out.write(buf, 0, numRead);
+ }
+ } finally {
+ if (in != null) {
+ in.close();
+ }
+ }
+ metadataMap.put(key,
+ new FileMetadata(key, file.length(), System.currentTimeMillis()));
+ dataMap.put(key, out.toByteArray());
+ }
+
+ public InputStream retrieve(String key) throws IOException {
+ return retrieve(key, 0);
+ }
+
+ // Copies the stored bytes from 'byteRangeStart' onward into a temp file
+ // and streams that file back.
+ // NOTE(review): an unknown key makes 'data' null and this throws NPE —
+ // confirm callers always probe retrieveMetadata first.
+ public InputStream retrieve(String key, long byteRangeStart)
+ throws IOException {
+
+ byte[] data = dataMap.get(key);
+ File file = createTempFile();
+ BufferedOutputStream out = null;
+ try {
+ out = new BufferedOutputStream(new FileOutputStream(file));
+ out.write(data, (int) byteRangeStart,
+ data.length - (int) byteRangeStart);
+ } finally {
+ if (out != null) {
+ out.close();
+ }
+ }
+ return new FileInputStream(file);
+ }
+
+ // Creates a delete-on-exit temp file under fs.s3.buffer.dir, creating
+ // the directory itself on first use.
+ private File createTempFile() throws IOException {
+ File dir = new File(conf.get("fs.s3.buffer.dir"));
+ if (!dir.exists() && !dir.mkdirs()) {
+ throw new IOException("Cannot create S3 buffer directory: " + dir);
+ }
+ File result = File.createTempFile("test-", ".tmp", dir);
+ result.deleteOnExit();
+ return result;
+ }
+
+ public FileMetadata retrieveMetadata(String key) throws IOException {
+ return metadataMap.get(key);
+ }
+
+ public PartialListing list(String prefix, int maxListingLength)
+ throws IOException {
+ return list(prefix, maxListingLength, null);
+ }
+
+ public PartialListing list(String prefix, int maxListingLength,
+ String priorLastKey) throws IOException {
+
+ return list(prefix, PATH_DELIMITER, maxListingLength, priorLastKey);
+ }
+
+ public PartialListing listAll(String prefix, int maxListingLength,
+ String priorLastKey) throws IOException {
+
+ return list(prefix, null, maxListingLength, priorLastKey);
+ }
+
+ // Emulates S3 listing: keys under 'prefix' are either reported directly
+ // or rolled up into a common prefix at the first 'delimiter' past the
+ // prefix; delimiter == null lists recursively.
+ // NOTE(review): 'priorLastKey' is never consulted to skip already-listed
+ // keys when resuming a paged listing — confirm whether that matters for
+ // these tests.
+ private PartialListing list(String prefix, String delimiter,
+ int maxListingLength, String priorLastKey) throws IOException {
+
+ if (prefix.length() > 0 && !prefix.endsWith(PATH_DELIMITER)) {
+ prefix += PATH_DELIMITER;
+ }
+
+ List<FileMetadata> metadata = new ArrayList<FileMetadata>();
+ SortedSet<String> commonPrefixes = new TreeSet<String>();
+ for (String key : dataMap.keySet()) {
+ if (key.startsWith(prefix)) {
+ if (delimiter == null) {
+ metadata.add(retrieveMetadata(key));
+ } else {
+ int delimIndex = key.indexOf(delimiter, prefix.length());
+ if (delimIndex == -1) {
+ metadata.add(retrieveMetadata(key));
+ } else {
+ String commonPrefix = key.substring(0, delimIndex);
+ commonPrefixes.add(commonPrefix);
+ }
+ }
+ }
+ if (metadata.size() + commonPrefixes.size() == maxListingLength) {
+ // Page is full: return a partial listing whose last key lets the
+ // caller resume.  (Fix: the original constructed this
+ // PartialListing but discarded it instead of returning it.)
+ return new PartialListing(key, metadata.toArray(new FileMetadata[0]),
+ commonPrefixes.toArray(new String[0]));
+ }
+ }
+ return new PartialListing(null, metadata.toArray(new FileMetadata[0]),
+ commonPrefixes.toArray(new String[0]));
+ }
+
+ public void delete(String key) throws IOException {
+ metadataMap.remove(key);
+ dataMap.remove(key);
+ }
+
+ public void rename(String srcKey, String dstKey) throws IOException {
+ metadataMap.put(dstKey, metadataMap.remove(srcKey));
+ dataMap.put(dstKey, dataMap.remove(srcKey));
+ }
+
+ // Removes every entry whose key starts with 'prefix' from both maps.
+ public void purge(String prefix) throws IOException {
+ Iterator<Entry<String, FileMetadata>> i =
+ metadataMap.entrySet().iterator();
+ while (i.hasNext()) {
+ Entry<String, FileMetadata> entry = i.next();
+ if (entry.getKey().startsWith(prefix)) {
+ dataMap.remove(entry.getKey());
+ i.remove();
+ }
+ }
+ }
+
+ public void dump() throws IOException {
+ System.out.println(metadataMap.values());
+ System.out.println(dataMap.keySet());
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/s3native/Jets3tNativeS3FileSystemContractTest.java b/src/test/org/apache/hadoop/fs/s3native/Jets3tNativeS3FileSystemContractTest.java
new file mode 100644
index 00000000000..6516c836f88
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3native/Jets3tNativeS3FileSystemContractTest.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3native;
+
+import java.io.IOException;
+
+// Runs the native-S3 contract suite against the real Jets3t-backed store
+// (i.e. against live S3, configured elsewhere).
+public class Jets3tNativeS3FileSystemContractTest
+ extends NativeS3FileSystemContractBaseTest {
+
+ @Override
+ NativeFileSystemStore getNativeFileSystemStore() throws IOException {
+ return new Jets3tNativeFileSystemStore();
+ }
+}
diff --git a/src/test/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java b/src/test/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java
new file mode 100644
index 00000000000..bf2e3c3d387
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3native/NativeS3FileSystemContractBaseTest.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3native;
+
+import java.io.IOException;
+import java.net.URI;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystemContractBaseTest;
+import org.apache.hadoop.fs.Path;
+
+// Template for native-S3 contract tests: a subclass supplies the
+// NativeFileSystemStore, setUp wires a NativeS3FileSystem over it using the
+// test.fs.s3n.name URI, and tearDown purges keys under "test".
+public abstract class NativeS3FileSystemContractBaseTest
+ extends FileSystemContractBaseTest {
+
+ private NativeFileSystemStore store;
+
+ /** Hook for subclasses to choose the backing store implementation. */
+ abstract NativeFileSystemStore getNativeFileSystemStore() throws IOException;
+
+ @Override
+ protected void setUp() throws Exception {
+ Configuration conf = new Configuration();
+ store = getNativeFileSystemStore();
+ fs = new NativeS3FileSystem(store);
+ fs.initialize(URI.create(conf.get("test.fs.s3n.name")), conf);
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ store.purge("test");
+ super.tearDown();
+ }
+
+ // Listing the root after creating /test must show exactly that one entry.
+ public void testListStatusForRoot() throws Exception {
+ Path testDir = path("/test");
+ assertTrue(fs.mkdirs(testDir));
+
+ FileStatus[] paths = fs.listStatus(path("/"));
+ assertEquals(1, paths.length);
+ assertEquals(path("/test"), paths[0].getPath());
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/fs/s3native/TestInMemoryNativeS3FileSystemContract.java b/src/test/org/apache/hadoop/fs/s3native/TestInMemoryNativeS3FileSystemContract.java
new file mode 100644
index 00000000000..664d39e6f4f
--- /dev/null
+++ b/src/test/org/apache/hadoop/fs/s3native/TestInMemoryNativeS3FileSystemContract.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3native;
+
+import java.io.IOException;
+
+public class TestInMemoryNativeS3FileSystemContract
+ extends NativeS3FileSystemContractBaseTest {
+
+ @Override
+ NativeFileSystemStore getNativeFileSystemStore() throws IOException {
+ return new InMemoryNativeFileSystemStore();
+ }
+}
diff --git a/src/test/org/apache/hadoop/http/TestGlobalFilter.java b/src/test/org/apache/hadoop/http/TestGlobalFilter.java
new file mode 100644
index 00000000000..51ab60697f2
--- /dev/null
+++ b/src/test/org/apache/hadoop/http/TestGlobalFilter.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.http;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Set;
+import java.util.TreeSet;
+
+import javax.servlet.Filter;
+import javax.servlet.FilterChain;
+import javax.servlet.FilterConfig;
+import javax.servlet.ServletException;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.http.HttpServletRequest;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+
+public class TestGlobalFilter extends junit.framework.TestCase {
+ static final Log LOG = LogFactory.getLog(HttpServer.class);
+ static final Set RECORDS = new TreeSet();
+
+  /** A very simple filter that records accessed URIs */
+ static public class RecordingFilter implements Filter {
+ private FilterConfig filterConfig = null;
+
+ public void init(FilterConfig filterConfig) {
+ this.filterConfig = filterConfig;
+ }
+
+ public void destroy() {
+ this.filterConfig = null;
+ }
+
+ public void doFilter(ServletRequest request, ServletResponse response,
+ FilterChain chain) throws IOException, ServletException {
+ if (filterConfig == null)
+ return;
+
+ String uri = ((HttpServletRequest)request).getRequestURI();
+ LOG.info("filtering " + uri);
+ RECORDS.add(uri);
+ chain.doFilter(request, response);
+ }
+
+ /** Configuration for RecordingFilter */
+ static public class Initializer extends FilterInitializer {
+ public Initializer() {}
+
+ void initFilter(FilterContainer container) {
+ container.addGlobalFilter("recording", RecordingFilter.class.getName(), null);
+ }
+ }
+ }
+
+
+ /** access a url, ignoring some IOException such as the page does not exist */
+ static void access(String urlstring) throws IOException {
+ LOG.warn("access " + urlstring);
+ URL url = new URL(urlstring);
+ URLConnection connection = url.openConnection();
+ connection.connect();
+
+ try {
+ BufferedReader in = new BufferedReader(new InputStreamReader(
+ connection.getInputStream()));
+ try {
+ for(; in.readLine() != null; );
+ } finally {
+ in.close();
+ }
+ } catch(IOException ioe) {
+ LOG.warn("urlstring=" + urlstring, ioe);
+ }
+ }
+
+ public void testServletFilter() throws Exception {
+ Configuration conf = new Configuration();
+
+    //start an http server with the RecordingFilter
+ conf.set(HttpServer.FILTER_INITIALIZER_PROPERTY,
+ RecordingFilter.Initializer.class.getName());
+ HttpServer http = new HttpServer("datanode", "localhost", 0, true, conf);
+ http.start();
+
+ final String fsckURL = "/fsck";
+ final String stacksURL = "/stacks";
+ final String ajspURL = "/a.jsp";
+ final String listPathsURL = "/listPaths";
+ final String dataURL = "/data";
+ final String streamFile = "/streamFile";
+ final String rootURL = "/";
+ final String allURL = "/*";
+ final String outURL = "/static/a.out";
+ final String logURL = "/logs/a.log";
+
+ final String[] urls = {fsckURL, stacksURL, ajspURL, listPathsURL,
+ dataURL, streamFile, rootURL, allURL, outURL, logURL};
+
+ //access the urls
+ final String prefix = "http://localhost:" + http.getPort();
+ try {
+ for(int i = 0; i < urls.length; i++) {
+ access(prefix + urls[i]);
+ }
+ } finally {
+ http.stop();
+ }
+
+ LOG.info("RECORDS = " + RECORDS);
+
+ //verify records
+ for(int i = 0; i < urls.length; i++) {
+ assertTrue(RECORDS.remove(urls[i]));
+ }
+ assertTrue(RECORDS.isEmpty());
+ }
+}
diff --git a/src/test/org/apache/hadoop/http/TestServletFilter.java b/src/test/org/apache/hadoop/http/TestServletFilter.java
new file mode 100644
index 00000000000..8052f9ad492
--- /dev/null
+++ b/src/test/org/apache/hadoop/http/TestServletFilter.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.http;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Random;
+
+import javax.servlet.Filter;
+import javax.servlet.FilterChain;
+import javax.servlet.FilterConfig;
+import javax.servlet.ServletException;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.http.HttpServletRequest;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+
+public class TestServletFilter extends junit.framework.TestCase {
+ static final Log LOG = LogFactory.getLog(HttpServer.class);
+ static volatile String uri = null;
+
+  /** A very simple filter which records the filtered URI. */
+ static public class SimpleFilter implements Filter {
+ private FilterConfig filterConfig = null;
+
+ public void init(FilterConfig filterConfig) {
+ this.filterConfig = filterConfig;
+ }
+
+ public void destroy() {
+ this.filterConfig = null;
+ }
+
+ public void doFilter(ServletRequest request, ServletResponse response,
+ FilterChain chain) throws IOException, ServletException {
+ if (filterConfig == null)
+ return;
+
+ uri = ((HttpServletRequest)request).getRequestURI();
+ LOG.info("filtering " + uri);
+ chain.doFilter(request, response);
+ }
+
+ /** Configuration for the filter */
+ static public class Initializer extends FilterInitializer {
+ public Initializer() {}
+
+ void initFilter(FilterContainer container) {
+ container.addFilter("simple", SimpleFilter.class.getName(), null);
+ }
+ }
+ }
+
+
+ /** access a url, ignoring some IOException such as the page does not exist */
+ static void access(String urlstring) throws IOException {
+ LOG.warn("access " + urlstring);
+ URL url = new URL(urlstring);
+ URLConnection connection = url.openConnection();
+ connection.connect();
+
+ try {
+ BufferedReader in = new BufferedReader(new InputStreamReader(
+ connection.getInputStream()));
+ try {
+ for(; in.readLine() != null; );
+ } finally {
+ in.close();
+ }
+ } catch(IOException ioe) {
+ LOG.warn("urlstring=" + urlstring, ioe);
+ }
+ }
+
+ public void testServletFilter() throws Exception {
+ Configuration conf = new Configuration();
+
+    //start an http server with the SimpleFilter
+ conf.set(HttpServer.FILTER_INITIALIZER_PROPERTY,
+ SimpleFilter.Initializer.class.getName());
+ HttpServer http = new HttpServer("datanode", "localhost", 0, true, conf);
+ http.start();
+
+ final String fsckURL = "/fsck";
+ final String stacksURL = "/stacks";
+ final String ajspURL = "/a.jsp";
+ final String logURL = "/logs/a.log";
+ final String hadooplogoURL = "/static/hadoop-logo.jpg";
+
+ final String[] urls = {fsckURL, stacksURL, ajspURL, logURL, hadooplogoURL};
+ final Random ran = new Random();
+ final int[] sequence = new int[50];
+
+ //generate a random sequence and update counts
+ for(int i = 0; i < sequence.length; i++) {
+ sequence[i] = ran.nextInt(urls.length);
+ }
+
+ //access the urls as the sequence
+ final String prefix = "http://localhost:" + http.getPort();
+ try {
+ for(int i = 0; i < sequence.length; i++) {
+ access(prefix + urls[sequence[i]]);
+
+        //make sure everything except fsck gets filtered
+ if (sequence[i] == 0) {
+ assertEquals(null, uri);
+ } else {
+ assertEquals(urls[sequence[i]], uri);
+ uri = null;
+ }
+ }
+ } finally {
+ http.stop();
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/RandomDatum.java b/src/test/org/apache/hadoop/io/RandomDatum.java
new file mode 100644
index 00000000000..ab8f34febab
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/RandomDatum.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.util.*;
+import java.io.*;
+
+public class RandomDatum implements WritableComparable {
+ private int length;
+ private byte[] data;
+
+ public RandomDatum() {}
+
+ public RandomDatum(Random random) {
+ length = 10 + (int) Math.pow(10.0, random.nextFloat() * 3.0);
+ data = new byte[length];
+ random.nextBytes(data);
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(length);
+ out.write(data);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ length = in.readInt();
+ if (data == null || length > data.length)
+ data = new byte[length];
+ in.readFully(data, 0, length);
+ }
+
+ public int compareTo(Object o) {
+ RandomDatum that = (RandomDatum)o;
+ return WritableComparator.compareBytes(this.data, 0, this.length,
+ that.data, 0, that.length);
+ }
+
+ public boolean equals(Object o) {
+ return compareTo(o) == 0;
+ }
+
+ private static final char[] HEX_DIGITS =
+ {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
+
+ /** Returns a string representation of this object. */
+ public String toString() {
+ StringBuffer buf = new StringBuffer(length*2);
+ for (int i = 0; i < length; i++) {
+ int b = data[i];
+ buf.append(HEX_DIGITS[(b >> 4) & 0xf]);
+ buf.append(HEX_DIGITS[b & 0xf]);
+ }
+ return buf.toString();
+ }
+
+ public static class Generator {
+ Random random;
+
+ private RandomDatum key;
+ private RandomDatum value;
+
+ public Generator() { random = new Random(); }
+ public Generator(int seed) { random = new Random(seed); }
+
+ public RandomDatum getKey() { return key; }
+ public RandomDatum getValue() { return value; }
+
+ public void next() {
+ key = new RandomDatum(random);
+ value = new RandomDatum(random);
+ }
+ }
+
+ /** A WritableComparator optimized for RandomDatum. */
+ public static class Comparator extends WritableComparator {
+ public Comparator() {
+ super(RandomDatum.class);
+ }
+
+ public int compare(byte[] b1, int s1, int l1,
+ byte[] b2, int s2, int l2) {
+ int n1 = readInt(b1, s1);
+ int n2 = readInt(b2, s2);
+ return compareBytes(b1, s1+4, n1, b2, s2+4, n2);
+ }
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/io/TestArrayFile.java b/src/test/org/apache/hadoop/io/TestArrayFile.java
new file mode 100644
index 00000000000..f279bd74319
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestArrayFile.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.*;
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.*;
+
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+
+/** Support for flat files of binary key/value pairs. */
+public class TestArrayFile extends TestCase {
+ private static final Log LOG = LogFactory.getLog(TestArrayFile.class);
+ private static String FILE =
+ System.getProperty("test.build.data",".") + "/test.array";
+
+ public TestArrayFile(String name) {
+ super(name);
+ }
+
+ public void testArrayFile() throws Exception {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.getLocal(conf);
+ RandomDatum[] data = generate(10000);
+ writeTest(fs, data, FILE);
+ readTest(fs, data, FILE, conf);
+ }
+
+ public void testEmptyFile() throws Exception {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.getLocal(conf);
+ writeTest(fs, new RandomDatum[0], FILE);
+ ArrayFile.Reader reader = new ArrayFile.Reader(fs, FILE, conf);
+ assertNull(reader.get(0, new RandomDatum()));
+ reader.close();
+ }
+
+ private static RandomDatum[] generate(int count) {
+ LOG.debug("generating " + count + " records in debug");
+ RandomDatum[] data = new RandomDatum[count];
+ RandomDatum.Generator generator = new RandomDatum.Generator();
+ for (int i = 0; i < count; i++) {
+ generator.next();
+ data[i] = generator.getValue();
+ }
+ return data;
+ }
+
+ private static void writeTest(FileSystem fs, RandomDatum[] data, String file)
+ throws IOException {
+ Configuration conf = new Configuration();
+ MapFile.delete(fs, file);
+ LOG.debug("creating with " + data.length + " debug");
+ ArrayFile.Writer writer = new ArrayFile.Writer(conf, fs, file, RandomDatum.class);
+ writer.setIndexInterval(100);
+ for (int i = 0; i < data.length; i++)
+ writer.append(data[i]);
+ writer.close();
+ }
+
+ private static void readTest(FileSystem fs, RandomDatum[] data, String file, Configuration conf)
+ throws IOException {
+ RandomDatum v = new RandomDatum();
+ LOG.debug("reading " + data.length + " debug");
+ ArrayFile.Reader reader = new ArrayFile.Reader(fs, file, conf);
+ for (int i = 0; i < data.length; i++) { // try forwards
+ reader.get(i, v);
+ if (!v.equals(data[i])) {
+ throw new RuntimeException("wrong value at " + i);
+ }
+ }
+ for (int i = data.length-1; i >= 0; i--) { // then backwards
+ reader.get(i, v);
+ if (!v.equals(data[i])) {
+ throw new RuntimeException("wrong value at " + i);
+ }
+ }
+ reader.close();
+ LOG.debug("done reading " + data.length + " debug");
+ }
+
+
+ /** For debugging and testing. */
+ public static void main(String[] args) throws Exception {
+ int count = 1024 * 1024;
+ boolean create = true;
+ boolean check = true;
+ String file = FILE;
+ String usage = "Usage: TestArrayFile [-count N] [-nocreate] [-nocheck] file";
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
+
+ Configuration conf = new Configuration();
+ int i = 0;
+ Path fpath = null;
+ FileSystem fs = null;
+ try {
+ for (; i < args.length; i++) { // parse command line
+ if (args[i] == null) {
+ continue;
+ } else if (args[i].equals("-count")) {
+ count = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-nocreate")) {
+ create = false;
+ } else if (args[i].equals("-nocheck")) {
+ check = false;
+ } else {
+ // file is required parameter
+ file = args[i];
+ fpath=new Path(file);
+ }
+ }
+
+ fs = fpath.getFileSystem(conf);
+
+ LOG.info("count = " + count);
+ LOG.info("create = " + create);
+ LOG.info("check = " + check);
+ LOG.info("file = " + file);
+
+ RandomDatum[] data = generate(count);
+
+ if (create) {
+ writeTest(fs, data, file);
+ }
+
+ if (check) {
+ readTest(fs, data, file, conf);
+ }
+ } finally {
+ fs.close();
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestArrayWritable.java b/src/test/org/apache/hadoop/io/TestArrayWritable.java
new file mode 100644
index 00000000000..47d0ce9f635
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestArrayWritable.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.*;
+
+import junit.framework.TestCase;
+
+/** Unit tests for ArrayWritable */
+public class TestArrayWritable extends TestCase {
+
+ static class TextArrayWritable extends ArrayWritable {
+ public TextArrayWritable() {
+ super(Text.class);
+ }
+ }
+
+ public TestArrayWritable(String name) {
+ super(name);
+ }
+
+ /**
+ * If valueClass is undefined, readFields should throw an exception indicating
+ * that the field is null. Otherwise, readFields should succeed.
+ */
+ public void testThrowUndefinedValueException() throws IOException {
+ // Get a buffer containing a simple text array
+ Text[] elements = {new Text("zero"), new Text("one"), new Text("two")};
+ TextArrayWritable sourceArray = new TextArrayWritable();
+ sourceArray.set(elements);
+
+ // Write it to a normal output buffer
+ DataOutputBuffer out = new DataOutputBuffer();
+ DataInputBuffer in = new DataInputBuffer();
+ sourceArray.write(out);
+
+    // Read the output buffer with TextArrayWritable. Since the valueClass is defined,
+ // this should succeed
+ TextArrayWritable destArray = new TextArrayWritable();
+ in.reset(out.getData(), out.getLength());
+ destArray.readFields(in);
+ Writable[] destElements = destArray.get();
+ assertTrue(destElements.length == elements.length);
+ for (int i = 0; i < elements.length; i++) {
+ assertEquals(destElements[i],elements[i]);
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestBloomMapFile.java b/src/test/org/apache/hadoop/io/TestBloomMapFile.java
new file mode 100644
index 00000000000..2a7d22455f6
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestBloomMapFile.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import junit.framework.TestCase;
+
+public class TestBloomMapFile extends TestCase {
+ private static Configuration conf = new Configuration();
+
+ public void testMembershipTest() throws Exception {
+ // write the file
+ Path dirName = new Path(System.getProperty("test.build.data",".") +
+ getName() + ".bloommapfile");
+ FileSystem fs = FileSystem.getLocal(conf);
+ Path qualifiedDirName = fs.makeQualified(dirName);
+ conf.setInt("io.mapfile.bloom.size", 2048);
+ BloomMapFile.Writer writer = new BloomMapFile.Writer(conf, fs,
+ qualifiedDirName.toString(), IntWritable.class, Text.class);
+ IntWritable key = new IntWritable();
+ Text value = new Text();
+ for (int i = 0; i < 2000; i += 2) {
+ key.set(i);
+ value.set("00" + i);
+ writer.append(key, value);
+ }
+ writer.close();
+
+ BloomMapFile.Reader reader = new BloomMapFile.Reader(fs,
+ qualifiedDirName.toString(), conf);
+ // check false positives rate
+ int falsePos = 0;
+ int falseNeg = 0;
+ for (int i = 0; i < 2000; i++) {
+ key.set(i);
+ boolean exists = reader.probablyHasKey(key);
+ if (i % 2 == 0) {
+ if (!exists) falseNeg++;
+ } else {
+ if (exists) falsePos++;
+ }
+ }
+ reader.close();
+ fs.delete(qualifiedDirName, true);
+ System.out.println("False negatives: " + falseNeg);
+ assertEquals(0, falseNeg);
+ System.out.println("False positives: " + falsePos);
+ assertTrue(falsePos < 2);
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/io/TestBytesWritable.java b/src/test/org/apache/hadoop/io/TestBytesWritable.java
new file mode 100644
index 00000000000..35e0d0ed827
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestBytesWritable.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+/**
+ * This is the unit test for BytesWritable.
+ */
+public class TestBytesWritable extends TestCase {
+
+ public void testSizeChange() throws Exception {
+ byte[] hadoop = "hadoop".getBytes();
+ BytesWritable buf = new BytesWritable(hadoop);
+ int size = buf.getLength();
+ int orig_capacity = buf.getCapacity();
+ buf.setSize(size*2);
+ int new_capacity = buf.getCapacity();
+ System.arraycopy(buf.getBytes(), 0, buf.getBytes(), size, size);
+ assertTrue(new_capacity >= size * 2);
+ assertEquals(size * 2, buf.getLength());
+ assertTrue(new_capacity != orig_capacity);
+ buf.setSize(size*4);
+ assertTrue(new_capacity != buf.getCapacity());
+ for(int i=0; i < size*2; ++i) {
+ assertEquals(hadoop[i%size], buf.getBytes()[i]);
+ }
+ // shrink the buffer
+ buf.setCapacity(1);
+ // make sure the size has been cut down too
+ assertEquals(1, buf.getLength());
+ // but that the data is still there
+ assertEquals(hadoop[0], buf.getBytes()[0]);
+ }
+
+ public void testHash() throws Exception {
+ byte[] owen = "owen".getBytes();
+ BytesWritable buf = new BytesWritable(owen);
+ assertEquals(4347922, buf.hashCode());
+ buf.setCapacity(10000);
+ assertEquals(4347922, buf.hashCode());
+ buf.setSize(0);
+ assertEquals(1, buf.hashCode());
+ }
+
+ public void testCompare() throws Exception {
+ byte[][] values = new byte[][]{"abc".getBytes(),
+ "ad".getBytes(),
+ "abcd".getBytes(),
+ "".getBytes(),
+ "b".getBytes()};
+ BytesWritable[] buf = new BytesWritable[values.length];
+ for(int i=0; i < values.length; ++i) {
+ buf[i] = new BytesWritable(values[i]);
+ }
+    // check to make sure the compare function is symmetric and reflexive
+ for(int i=0; i < values.length; ++i) {
+ for(int j=0; j < values.length; ++j) {
+ assertTrue(buf[i].compareTo(buf[j]) == -buf[j].compareTo(buf[i]));
+ assertTrue((i == j) == (buf[i].compareTo(buf[j]) == 0));
+ }
+ }
+ assertTrue(buf[0].compareTo(buf[1]) < 0);
+ assertTrue(buf[1].compareTo(buf[2]) > 0);
+ assertTrue(buf[2].compareTo(buf[3]) > 0);
+ assertTrue(buf[3].compareTo(buf[4]) < 0);
+ }
+
+ private void checkToString(byte[] input, String expected) {
+ String actual = new BytesWritable(input).toString();
+ assertEquals(expected, actual);
+ }
+
+ public void testToString() {
+ checkToString(new byte[]{0,1,2,0x10}, "00 01 02 10");
+ checkToString(new byte[]{-0x80, -0x7f, -0x1, -0x2, 1, 0},
+ "80 81 ff fe 01 00");
+ }
+}
+
diff --git a/src/test/org/apache/hadoop/io/TestDefaultStringifier.java b/src/test/org/apache/hadoop/io/TestDefaultStringifier.java
new file mode 100644
index 00000000000..c96cc732938
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestDefaultStringifier.java
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.IOException;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+
+public class TestDefaultStringifier extends TestCase {
+
+ private static Configuration conf = new Configuration();
+ private static final Log LOG = LogFactory.getLog(TestDefaultStringifier.class);
+
+ private char[] alphabet = "abcdefghijklmnopqrstuvwxyz".toCharArray();
+
+ public void testWithWritable() throws Exception {
+
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.WritableSerialization");
+
+ LOG.info("Testing DefaultStringifier with Text");
+
+ Random random = new Random();
+
+ //test with a Text
+ for(int i=0;i<10;i++) {
+ //generate a random string
+ StringBuilder builder = new StringBuilder();
+ int strLen = random.nextInt(40);
+ for(int j=0; j< strLen; j++) {
+ builder.append(alphabet[random.nextInt(alphabet.length)]);
+ }
+ Text text = new Text(builder.toString());
+ DefaultStringifier stringifier = new DefaultStringifier(conf, Text.class);
+
+ String str = stringifier.toString(text);
+ Text claimedText = stringifier.fromString(str);
+ LOG.info("Object: " + text);
+ LOG.info("String representation of the object: " + str);
+ assertEquals(text, claimedText);
+ }
+ }
+
+ public void testWithJavaSerialization() throws Exception {
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization");
+
+ LOG.info("Testing DefaultStringifier with Serializable Integer");
+
+ //Integer implements Serializable
+ Integer testInt = Integer.valueOf(42);
+ DefaultStringifier stringifier = new DefaultStringifier(conf, Integer.class);
+
+ String str = stringifier.toString(testInt);
+ Integer claimedInt = stringifier.fromString(str);
+ LOG.info("String representation of the object: " + str);
+
+ assertEquals(testInt, claimedInt);
+ }
+
+ public void testStoreLoad() throws IOException {
+
+ LOG.info("Testing DefaultStringifier#store() and #load()");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.WritableSerialization");
+ Text text = new Text("uninteresting test string");
+ String keyName = "test.defaultstringifier.key1";
+
+ DefaultStringifier.store(conf,text, keyName);
+
+ Text claimedText = DefaultStringifier.load(conf, keyName, Text.class);
+ assertEquals("DefaultStringifier#load() or #store() might be flawed"
+ , text, claimedText);
+
+ }
+
+ public void testStoreLoadArray() throws IOException {
+ LOG.info("Testing DefaultStringifier#storeArray() and #loadArray()");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization");
+
+ String keyName = "test.defaultstringifier.key2";
+
+ Integer[] array = new Integer[] {1,2,3,4,5};
+
+
+ DefaultStringifier.storeArray(conf, array, keyName);
+
+ Integer[] claimedArray = DefaultStringifier.loadArray(conf, keyName, Integer.class);
+ for (int i = 0; i < array.length; i++) {
+ assertEquals("two arrays are not equal", array[i], claimedArray[i]);
+ }
+
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/io/TestEnumSetWritable.java b/src/test/org/apache/hadoop/io/TestEnumSetWritable.java
new file mode 100644
index 00000000000..a512bb1bc2d
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestEnumSetWritable.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.IOException;
+import java.util.EnumSet;
+
+import junit.framework.TestCase;
+
+/** Unit test for EnumSetWritable */
+/** Unit test for EnumSetWritable */
+public class TestEnumSetWritable extends TestCase {
+
+ enum TestEnumSet {
+ CREATE, OVERWRITE, APPEND;
+ }
+
+ // Non-empty set: the element type can be inferred from the first element,
+ // so the single-argument EnumSetWritable constructor is sufficient.
+ EnumSet<TestEnumSet> nonEmptyFlag = EnumSet.of(TestEnumSet.APPEND);
+ EnumSetWritable<TestEnumSet> nonEmptyFlagWritable = new EnumSetWritable<TestEnumSet>(
+ nonEmptyFlag);
+
+ /** Serializes a non-empty set through ObjectWritable and reads it back. */
+ @SuppressWarnings("unchecked")
+ public void testSerializeAndDeserializeNonEmpty() throws IOException {
+ DataOutputBuffer out = new DataOutputBuffer();
+ ObjectWritable.writeObject(out, nonEmptyFlagWritable, nonEmptyFlagWritable
+ .getClass(), null);
+ DataInputBuffer in = new DataInputBuffer();
+ in.reset(out.getData(), out.getLength());
+ EnumSet<TestEnumSet> read = ((EnumSetWritable<TestEnumSet>) ObjectWritable
+ .readObject(in, null)).get();
+ assertEquals(nonEmptyFlag, read);
+ }
+
+ EnumSet<TestEnumSet> emptyFlag = EnumSet.noneOf(TestEnumSet.class);
+
+ /**
+ * An empty set carries no element from which the element type could be
+ * inferred, so the single-argument constructor must fail; the two-argument
+ * constructor taking the element class explicitly must round-trip cleanly.
+ */
+ @SuppressWarnings("unchecked")
+ public void testSerializeAndDeserializeEmpty() throws IOException {
+
+ boolean gotException = false;
+ try {
+ new EnumSetWritable<TestEnumSet>(emptyFlag);
+ } catch (RuntimeException e) {
+ gotException = true;
+ }
+
+ assertTrue(
+ "Instantiating an empty EnumSetWritable with no element type class provided should throw an exception.",
+ gotException);
+
+ EnumSetWritable<TestEnumSet> emptyFlagWritable = new EnumSetWritable<TestEnumSet>(
+ emptyFlag, TestEnumSet.class);
+ DataOutputBuffer out = new DataOutputBuffer();
+ ObjectWritable.writeObject(out, emptyFlagWritable, emptyFlagWritable
+ .getClass(), null);
+ DataInputBuffer in = new DataInputBuffer();
+ in.reset(out.getData(), out.getLength());
+ EnumSet<TestEnumSet> read = ((EnumSetWritable<TestEnumSet>) ObjectWritable
+ .readObject(in, null)).get();
+ assertEquals(emptyFlag, read);
+ }
+
+ /**
+ * Like the empty-set case, a null set gives no element type to infer;
+ * the explicit element class constructor must round-trip the null.
+ */
+ @SuppressWarnings("unchecked")
+ public void testSerializeAndDeserializeNull() throws IOException {
+
+ boolean gotException = false;
+ try {
+ new EnumSetWritable<TestEnumSet>(null);
+ } catch (RuntimeException e) {
+ gotException = true;
+ }
+
+ assertTrue(
+ "Instantiating a null EnumSetWritable with no element type class provided should throw an exception.",
+ gotException);
+
+ EnumSetWritable<TestEnumSet> nullFlagWritable = new EnumSetWritable<TestEnumSet>(
+ null, TestEnumSet.class);
+
+ DataOutputBuffer out = new DataOutputBuffer();
+ ObjectWritable.writeObject(out, nullFlagWritable, nullFlagWritable
+ .getClass(), null);
+ DataInputBuffer in = new DataInputBuffer();
+ in.reset(out.getData(), out.getLength());
+ EnumSet<TestEnumSet> read = ((EnumSetWritable<TestEnumSet>) ObjectWritable
+ .readObject(in, null)).get();
+ assertNull(read);
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestGenericWritable.java b/src/test/org/apache/hadoop/io/TestGenericWritable.java
new file mode 100644
index 00000000000..486d93d4385
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestGenericWritable.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * TestCase for {@link GenericWritable} class.
+ * @see TestWritable#testWritable(Writable)
+ */
+public class TestGenericWritable extends TestCase {
+
+ private Configuration conf;
+ public static final String CONF_TEST_KEY = "test.generic.writable";
+ public static final String CONF_TEST_VALUE = "dummy";
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ conf = new Configuration();
+ //set the configuration parameter
+ conf.set(CONF_TEST_KEY, CONF_TEST_VALUE);
+ }
+
+ /** Dummy class for testing {@link GenericWritable} */
+ public static class Foo implements Writable {
+ private String foo = "foo";
+ public void readFields(DataInput in) throws IOException {
+ foo = Text.readString(in);
+ }
+ public void write(DataOutput out) throws IOException {
+ Text.writeString(out, foo);
+ }
+ @Override
+ public boolean equals(Object obj) {
+ if (!(obj instanceof Foo))
+ return false;
+ return this.foo.equals(((Foo)obj).foo);
+ }
+ // equals() is overridden, so hashCode() must be consistent with it.
+ @Override
+ public int hashCode() {
+ return foo.hashCode();
+ }
+ }
+ /** Dummy class for testing {@link GenericWritable} */
+ public static class Bar implements Writable, Configurable {
+ private int bar = 42; //The Answer to The Ultimate Question Of Life, the Universe and Everything
+ private Configuration conf = null;
+ public void readFields(DataInput in) throws IOException {
+ bar = in.readInt();
+ }
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(bar);
+ }
+ public Configuration getConf() {
+ return conf;
+ }
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+ @Override
+ public boolean equals(Object obj) {
+ if (!(obj instanceof Bar))
+ return false;
+ return this.bar == ((Bar)obj).bar;
+ }
+ // equals() is overridden, so hashCode() must be consistent with it.
+ @Override
+ public int hashCode() {
+ return bar;
+ }
+ }
+
+ /**
+ * Dummy class for testing {@link GenericWritable}; its readFields()
+ * asserts that the Configuration was propagated to the wrapped object.
+ */
+ public static class Baz extends Bar {
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ super.readFields(in);
+ //needs a configuration parameter
+ assertEquals("Configuration is not set for the wrapped object",
+ CONF_TEST_VALUE, getConf().get(CONF_TEST_KEY));
+ }
+ @Override
+ public void write(DataOutput out) throws IOException {
+ super.write(out);
+ }
+ }
+
+ /** Dummy class for testing {@link GenericWritable} */
+ public static class FooGenericWritable extends GenericWritable {
+ @Override
+ @SuppressWarnings("unchecked")
+ protected Class<? extends Writable>[] getTypes() {
+ return new Class[] {Foo.class, Bar.class, Baz.class};
+ }
+ @Override
+ public boolean equals(Object obj) {
+ if(! (obj instanceof FooGenericWritable))
+ return false;
+ return get().equals(((FooGenericWritable)obj).get());
+ }
+ // equals() delegates to the wrapped instance; hashCode() must match.
+ @Override
+ public int hashCode() {
+ return get().hashCode();
+ }
+ }
+
+ /** Round-trips a plain Writable wrapped in the GenericWritable. */
+ public void testFooWritable() throws Exception {
+ System.out.println("Testing Writable wrapped in GenericWritable");
+ FooGenericWritable generic = new FooGenericWritable();
+ generic.setConf(conf);
+ Foo foo = new Foo();
+ generic.set(foo);
+ TestWritable.testWritable(generic);
+ }
+
+ /** Round-trips a Configurable Writable and checks conf propagation. */
+ public void testBarWritable() throws Exception {
+ System.out.println("Testing Writable, Configurable wrapped in GenericWritable");
+ FooGenericWritable generic = new FooGenericWritable();
+ generic.setConf(conf);
+ Bar bar = new Bar();
+ bar.setConf(conf);
+ generic.set(bar);
+
+ //test writing generic writable
+ FooGenericWritable after
+ = (FooGenericWritable)TestWritable.testWritable(generic, conf);
+
+ //test configuration
+ System.out.println("Testing if Configuration is passed to wrapped classes");
+ assertTrue(after.get() instanceof Configurable);
+ assertNotNull(((Configurable)after.get()).getConf());
+ }
+
+ /** Baz#readFields itself asserts that the configuration arrived. */
+ public void testBazWritable() throws Exception {
+ System.out.println("Testing for GenericWritable to find class names");
+ FooGenericWritable generic = new FooGenericWritable();
+ generic.setConf(conf);
+ Baz baz = new Baz();
+ generic.set(baz);
+ TestWritable.testWritable(generic, conf);
+ }
+
+ /** set() accepts registered types and rejects unregistered ones. */
+ public void testSet() throws Exception {
+ Foo foo = new Foo();
+ FooGenericWritable generic = new FooGenericWritable();
+ //exception should not occur
+ generic.set(foo);
+
+ try {
+ //exception should occur, since IntWritable is not registered
+ generic = new FooGenericWritable();
+ generic.set(new IntWritable(1));
+ fail("Generic writable should have thrown an exception for a Writable not registered");
+ }catch (RuntimeException e) {
+ //ignore
+ }
+
+ }
+
+ /** get() returns the exact instance passed to set(). */
+ public void testGet() throws Exception {
+ Foo foo = new Foo();
+ FooGenericWritable generic = new FooGenericWritable();
+ generic.set(foo);
+ assertEquals(foo, generic.get());
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/io/TestMD5Hash.java b/src/test/org/apache/hadoop/io/TestMD5Hash.java
new file mode 100644
index 00000000000..feb1107ed46
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestMD5Hash.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import org.apache.hadoop.io.TestWritable;
+import junit.framework.TestCase;
+import java.security.MessageDigest;
+import java.util.Random;
+
+/** Unit tests for MD5Hash. */
+/** Unit tests for MD5Hash. */
+public class TestMD5Hash extends TestCase {
+ public TestMD5Hash(String name) { super(name); }
+
+ private static final Random RANDOM = new Random();
+
+ /** Returns the MD5 digest of a 1 KB random buffer, wrapped as an MD5Hash. */
+ public static MD5Hash getTestHash() throws Exception {
+ MessageDigest digest = MessageDigest.getInstance("MD5");
+ byte[] buffer = new byte[1024];
+ RANDOM.nextBytes(buffer);
+ digest.update(buffer);
+ return new MD5Hash(digest.digest());
+ }
+
+ protected static byte[] D00 = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ protected static byte[] DFF = new byte[] {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+ public void testMD5Hash() throws Exception {
+ MD5Hash md5Hash = getTestHash();
+
+ final MD5Hash md5Hash00
+ = new MD5Hash(D00);
+
+ final MD5Hash md5HashFF
+ = new MD5Hash(DFF);
+
+ MD5Hash orderedHash = new MD5Hash(new byte[]{1,2,3,4,5,6,7,8,9,10,11,12,
+ 13,14,15,16});
+ MD5Hash backwardHash = new MD5Hash(new byte[]{-1,-2,-3,-4,-5,-6,-7,-8,
+ -9,-10,-11,-12, -13, -14,
+ -15,-16});
+ MD5Hash closeHash1 = new MD5Hash(new byte[]{-1,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0});
+ MD5Hash closeHash2 = new MD5Hash(new byte[]{-1,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0});
+
+ // test i/o
+ TestWritable.testWritable(md5Hash);
+ TestWritable.testWritable(md5Hash00);
+ TestWritable.testWritable(md5HashFF);
+
+ // test equals()
+ assertEquals(md5Hash, md5Hash);
+ assertEquals(md5Hash00, md5Hash00);
+ assertEquals(md5HashFF, md5HashFF);
+
+ // test compareTo()
+ assertTrue(md5Hash.compareTo(md5Hash) == 0);
+ assertTrue(md5Hash00.compareTo(md5Hash) < 0);
+ assertTrue(md5HashFF.compareTo(md5Hash) > 0);
+
+ // test toString and string ctor
+ assertEquals(md5Hash, new MD5Hash(md5Hash.toString()));
+ assertEquals(md5Hash00, new MD5Hash(md5Hash00.toString()));
+ assertEquals(md5HashFF, new MD5Hash(md5HashFF.toString()));
+
+ assertEquals(0x01020304, orderedHash.quarterDigest());
+ assertEquals(0xfffefdfc, backwardHash.quarterDigest());
+
+ assertEquals(0x0102030405060708L, orderedHash.halfDigest());
+ assertEquals(0xfffefdfcfbfaf9f8L, backwardHash.halfDigest());
+ assertTrue("hash collision",
+ closeHash1.hashCode() != closeHash2.hashCode());
+
+ // Exercise MD5Hash construction from two threads concurrently. JUnit
+ // cannot observe assertion failures raised on other threads, so worker
+ // failures are captured here and rethrown from the test thread after
+ // join(); without this the concurrent checks could never fail the test.
+ final Throwable[] threadFailure = new Throwable[1];
+
+ Thread t1 = new Thread() {
+ public void run() {
+ try {
+ for (int i = 0; i < 100; i++) {
+ MD5Hash hash = new MD5Hash(DFF);
+ assertEquals(hash, md5HashFF);
+ }
+ } catch (Throwable t) {
+ threadFailure[0] = t;
+ }
+ }
+ };
+
+ Thread t2 = new Thread() {
+ public void run() {
+ try {
+ for (int i = 0; i < 100; i++) {
+ MD5Hash hash = new MD5Hash(D00);
+ assertEquals(hash, md5Hash00);
+ }
+ } catch (Throwable t) {
+ if (threadFailure[0] == null) {
+ threadFailure[0] = t;
+ }
+ }
+ }
+ };
+
+ t1.start();
+ t2.start();
+ t1.join();
+ t2.join();
+
+ if (threadFailure[0] != null) {
+ fail("failure in worker thread: " + threadFailure[0]);
+ }
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/io/TestMapFile.java b/src/test/org/apache/hadoop/io/TestMapFile.java
new file mode 100644
index 00000000000..f006d4f4013
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestMapFile.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.io;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import junit.framework.TestCase;
+
+public class TestMapFile extends TestCase {
+ private static Configuration conf = new Configuration();
+
+ /**
+ * Test getClosest feature.
+ * @throws Exception
+ */
+ public void testGetClosest() throws Exception {
+ // Write a mapfile of simple data: keys are two-digit strings "10".."90".
+ Path dirName = new Path(System.getProperty("test.build.data",".") +
+ getName() + ".mapfile");
+ FileSystem fs = FileSystem.getLocal(conf);
+ Path qualifiedDirName = fs.makeQualified(dirName);
+ // Make an index entry for every third insertion.
+ MapFile.Writer.setIndexInterval(conf, 3);
+ MapFile.Writer writer = new MapFile.Writer(conf, fs,
+ qualifiedDirName.toString(), Text.class, Text.class);
+ // Assert that the index interval configured above took effect.
+ assertEquals(3, writer.getIndexInterval());
+ // Add entries up to 100 in intervals of ten.
+ final int FIRST_KEY = 10;
+ for (int i = FIRST_KEY; i < 100; i += 10) {
+ String iStr = Integer.toString(i);
+ Text t = new Text("00".substring(iStr.length()) + iStr);
+ writer.append(t, t);
+ }
+ writer.close();
+ // Now do getClosest on created mapfile.
+ MapFile.Reader reader = new MapFile.Reader(fs, qualifiedDirName.toString(),
+ conf);
+ Text key = new Text("55");
+ Text value = new Text();
+ Text closest = (Text)reader.getClosest(key, value);
+ // Assert that closest after 55 is 60
+ assertEquals(new Text("60"), closest);
+ // Get closest that falls before the passed key: 50
+ closest = (Text)reader.getClosest(key, value, true);
+ assertEquals(new Text("50"), closest);
+ // Test get closest when we pass explicit key
+ final Text TWENTY = new Text("20");
+ closest = (Text)reader.getClosest(TWENTY, value);
+ assertEquals(TWENTY, closest);
+ closest = (Text)reader.getClosest(TWENTY, value, true);
+ assertEquals(TWENTY, closest);
+ // Test what happens at boundaries. Assert if searching a key that is
+ // less than first key in the mapfile, that the first key is returned.
+ key = new Text("00");
+ closest = (Text)reader.getClosest(key, value);
+ assertEquals(FIRST_KEY, Integer.parseInt(closest.toString()));
+
+ // If we're looking for the first key before, and we pass in a key before
+ // the first key in the file, we should get null
+ closest = (Text)reader.getClosest(key, value, true);
+ assertNull(closest);
+
+ // Assert that null is returned if key is > last entry in mapfile.
+ key = new Text("99");
+ closest = (Text)reader.getClosest(key, value);
+ assertNull(closest);
+
+ // If we were looking for the key before, we should get the last key
+ closest = (Text)reader.getClosest(key, value, true);
+ assertEquals(new Text("90"), closest);
+ }
+
+ /** midKey() of a single-entry mapfile is that entry's key. */
+ public void testMidKey() throws Exception {
+ // Write a mapfile holding a single key/value pair.
+ Path dirName = new Path(System.getProperty("test.build.data",".") +
+ getName() + ".mapfile");
+ FileSystem fs = FileSystem.getLocal(conf);
+ Path qualifiedDirName = fs.makeQualified(dirName);
+
+ MapFile.Writer writer = new MapFile.Writer(conf, fs,
+ qualifiedDirName.toString(), IntWritable.class, IntWritable.class);
+ writer.append(new IntWritable(1), new IntWritable(1));
+ writer.close();
+ // Re-open the mapfile and query its middle key.
+ MapFile.Reader reader = new MapFile.Reader(fs, qualifiedDirName.toString(),
+ conf);
+ assertEquals(new IntWritable(1), reader.midKey());
+ }
+
+
+ /** midKey() of an empty mapfile is null rather than an error. */
+ public void testMidKeyEmpty() throws Exception {
+ // Write a mapfile with no entries at all.
+ Path dirName = new Path(System.getProperty("test.build.data",".") +
+ getName() + ".mapfile");
+ FileSystem fs = FileSystem.getLocal(conf);
+ Path qualifiedDirName = fs.makeQualified(dirName);
+
+ MapFile.Writer writer = new MapFile.Writer(conf, fs,
+ qualifiedDirName.toString(), IntWritable.class, IntWritable.class);
+ writer.close();
+ // Re-open the empty mapfile and query its middle key.
+ MapFile.Reader reader = new MapFile.Reader(fs, qualifiedDirName.toString(),
+ conf);
+ assertEquals(null, reader.midKey());
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestMapWritable.java b/src/test/org/apache/hadoop/io/TestMapWritable.java
new file mode 100644
index 00000000000..3d8c4ab3c20
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestMapWritable.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests MapWritable
+ */
+public class TestMapWritable extends TestCase {
+ /** the test */
+ @SuppressWarnings("unchecked")
+ public void testMapWritable() {
+ Text[] keys = {
+ new Text("key1"),
+ new Text("key2"),
+ new Text("Key3"),
+ };
+
+ BytesWritable[] values = {
+ new BytesWritable("value1".getBytes()),
+ new BytesWritable("value2".getBytes()),
+ new BytesWritable("value3".getBytes())
+ };
+
+ MapWritable inMap = new MapWritable();
+ for (int i = 0; i < keys.length; i++) {
+ inMap.put(keys[i], values[i]);
+ }
+
+ // The copy constructor must produce an equal-sized, entry-equal map.
+ MapWritable outMap = new MapWritable(inMap);
+ assertEquals(inMap.size(), outMap.size());
+
+ for (Map.Entry e: inMap.entrySet()) {
+ assertTrue(outMap.containsKey(e.getKey()));
+ assertEquals(0, ((WritableComparable) outMap.get(e.getKey())).compareTo(
+ e.getValue()));
+ }
+
+ // Now for something a little harder...
+
+ Text[] maps = {
+ new Text("map1"),
+ new Text("map2")
+ };
+
+ // Nest MapWritables inside a MapWritable and copy the whole structure.
+ MapWritable mapOfMaps = new MapWritable();
+ mapOfMaps.put(maps[0], inMap);
+ mapOfMaps.put(maps[1], outMap);
+
+ MapWritable copyOfMapOfMaps = new MapWritable(mapOfMaps);
+ for (int i = 0; i < maps.length; i++) {
+ assertTrue(copyOfMapOfMaps.containsKey(maps[i]));
+ MapWritable a = (MapWritable) mapOfMaps.get(maps[i]);
+ MapWritable b = (MapWritable) copyOfMapOfMaps.get(maps[i]);
+ assertEquals(a.size(), b.size());
+ for (Writable key: a.keySet()) {
+ assertTrue(b.containsKey(key));
+
+ // This will work because we know what we put into each set
+
+ WritableComparable aValue = (WritableComparable) a.get(key);
+ WritableComparable bValue = (WritableComparable) b.get(key);
+ assertEquals(0, aValue.compareTo(bValue));
+ }
+ }
+ }
+
+ /**
+ * Test that number of "unknown" classes is propagated across multiple copies.
+ */
+ @SuppressWarnings("deprecation")
+ public void testForeignClass() {
+ MapWritable inMap = new MapWritable();
+ inMap.put(new Text("key"), new UTF8("value"));
+ inMap.put(new Text("key2"), new UTF8("value2"));
+ MapWritable outMap = new MapWritable(inMap);
+ MapWritable copyOfCopy = new MapWritable(outMap);
+ // Both values share one foreign class (UTF8), so the count stays 1
+ // through two copy-constructor hops.
+ assertEquals(1, copyOfCopy.getNewClasses());
+ }
+
+ /**
+ * Assert MapWritable does not grow across calls to readFields.
+ * @throws Exception
+ * @see HADOOP-2244
+ */
+ public void testMultipleCallsToReadFieldsAreSafe() throws Exception {
+ // Create an instance and add a key/value.
+ MapWritable m = new MapWritable();
+ final Text t = new Text(getName());
+ m.put(t, t);
+ // Get current size of map. Key values are 't'.
+ int count = m.size();
+ // Now serialize... save off the bytes.
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutputStream dos = new DataOutputStream(baos);
+ m.write(dos);
+ dos.close();
+ // Now add new values to the MapWritable.
+ m.put(new Text("key1"), new Text("value1"));
+ m.put(new Text("key2"), new Text("value2"));
+ // Now deserialize the original MapWritable. Ensure count and key values
+ // match original state: readFields must replace, not merge into, the
+ // existing contents.
+ ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
+ DataInputStream dis = new DataInputStream(bais);
+ m.readFields(dis);
+ assertEquals(count, m.size());
+ assertTrue(m.get(t).equals(t));
+ dis.close();
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestSequenceFileSerialization.java b/src/test/org/apache/hadoop/io/TestSequenceFileSerialization.java
new file mode 100644
index 00000000000..c9fc1eae4f5
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestSequenceFileSerialization.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.hadoop.io.SequenceFile.Writer;
+
+public class TestSequenceFileSerialization extends TestCase {
+
+ private Configuration conf;
+ private FileSystem fs;
+
+ @Override
+ protected void setUp() throws Exception {
+ conf = new Configuration();
+ // Use Java serialization so plain (non-Writable) Long/String values
+ // can be stored in the sequence file.
+ conf.set("io.serializations",
+ "org.apache.hadoop.io.serializer.JavaSerialization");
+ fs = FileSystem.getLocal(conf);
+ }
+
+ @Override
+ protected void tearDown() throws Exception {
+ fs.close();
+ }
+
+ /**
+ * Writes two Long/String records using Java serialization and reads them
+ * back, checking keys, values, and end-of-file behavior.
+ */
+ public void testJavaSerialization() throws Exception {
+ Path file = new Path(System.getProperty("test.build.data",".") +
+ "/test.seq");
+
+ fs.delete(file, true);
+ Writer writer = SequenceFile.createWriter(fs, conf, file, Long.class,
+ String.class);
+
+ writer.append(1L, "one");
+ writer.append(2L, "two");
+
+ writer.close();
+
+ Reader reader = new Reader(fs, file, conf);
+ assertEquals(1L, reader.next((Object) null));
+ assertEquals("one", reader.getCurrentValue((Object) null));
+ assertEquals(2L, reader.next((Object) null));
+ assertEquals("two", reader.getCurrentValue((Object) null));
+ // next() returns null once the file is exhausted.
+ assertNull(reader.next((Object) null));
+ reader.close();
+
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestSetFile.java b/src/test/org/apache/hadoop/io/TestSetFile.java
new file mode 100644
index 00000000000..70d02e013f0
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestSetFile.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.*;
+import java.util.*;
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.*;
+
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+
+/** Support for flat files of binary key/value pairs. */
+/** Support for flat files of binary key/value pairs. */
+public class TestSetFile extends TestCase {
+ private static final Log LOG = LogFactory.getLog(TestSetFile.class);
+ private static String FILE =
+ System.getProperty("test.build.data",".") + "/test.set";
+
+ private static Configuration conf = new Configuration();
+
+ public TestSetFile(String name) { super(name); }
+
+ /** Writes and re-reads a sorted data set, uncompressed and block-compressed. */
+ public void testSetFile() throws Exception {
+ FileSystem fs = FileSystem.getLocal(conf);
+ try {
+ RandomDatum[] data = generate(10000);
+ writeTest(fs, data, FILE, CompressionType.NONE);
+ readTest(fs, data, FILE);
+
+ writeTest(fs, data, FILE, CompressionType.BLOCK);
+ readTest(fs, data, FILE);
+ } finally {
+ fs.close();
+ }
+ }
+
+ /** Generates {@code count} random records, sorted as SetFile requires. */
+ private static RandomDatum[] generate(int count) {
+ LOG.info("generating " + count + " records in memory");
+ RandomDatum[] data = new RandomDatum[count];
+ RandomDatum.Generator generator = new RandomDatum.Generator();
+ for (int i = 0; i < count; i++) {
+ generator.next();
+ data[i] = generator.getValue();
+ }
+ LOG.info("sorting " + count + " records");
+ Arrays.sort(data);
+ return data;
+ }
+
+ /** Replaces any existing SetFile and writes {@code data} with the given compression. */
+ private static void writeTest(FileSystem fs, RandomDatum[] data,
+ String file, CompressionType compress)
+ throws IOException {
+ MapFile.delete(fs, file);
+ LOG.info("creating with " + data.length + " records");
+ SetFile.Writer writer =
+ new SetFile.Writer(conf, fs, file,
+ WritableComparator.get(RandomDatum.class),
+ compress);
+ for (int i = 0; i < data.length; i++)
+ writer.append(data[i]);
+ writer.close();
+ }
+
+ /** Seeks a random sample of about sqrt(n) records; fails if any is absent. */
+ private static void readTest(FileSystem fs, RandomDatum[] data, String file)
+ throws IOException {
+ RandomDatum v = new RandomDatum();
+ int sample = (int)Math.sqrt(data.length);
+ Random random = new Random();
+ LOG.info("reading " + sample + " records");
+ SetFile.Reader reader = new SetFile.Reader(fs, file, conf);
+ for (int i = 0; i < sample; i++) {
+ if (!reader.seek(data[random.nextInt(data.length)]))
+ throw new RuntimeException("wrong value at " + i);
+ }
+ reader.close();
+ LOG.info("done reading " + data.length);
+ }
+
+
+ /** For debugging and testing. */
+ public static void main(String[] args) throws Exception {
+ int count = 1024 * 1024;
+ boolean create = true;
+ boolean check = true;
+ String file = FILE;
+ String compress = "NONE";
+
+ String usage = "Usage: TestSetFile [-count N] [-nocreate] [-nocheck] [-compress type] file";
+
+ if (args.length == 0) {
+ System.err.println(usage);
+ System.exit(-1);
+ }
+
+ int i = 0;
+ Path fpath=null;
+ FileSystem fs = null;
+ try {
+ for (; i < args.length; i++) { // parse command line
+ if (args[i] == null) {
+ continue;
+ } else if (args[i].equals("-count")) {
+ count = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-nocreate")) {
+ create = false;
+ } else if (args[i].equals("-nocheck")) {
+ check = false;
+ } else if (args[i].equals("-compress")) {
+ compress = args[++i];
+ } else {
+ // file is required parameter
+ file = args[i];
+ fpath=new Path(file);
+ }
+ }
+
+ if (fpath == null) {
+ // Only option flags were given; without a file argument there is
+ // nothing to operate on, so print usage instead of hitting an NPE.
+ System.err.println(usage);
+ System.exit(-1);
+ }
+
+ fs = fpath.getFileSystem(conf);
+
+ LOG.info("count = " + count);
+ LOG.info("create = " + create);
+ LOG.info("check = " + check);
+ LOG.info("compress = " + compress);
+ LOG.info("file = " + file);
+
+ RandomDatum[] data = generate(count);
+
+ if (create) {
+ writeTest(fs, data, file, CompressionType.valueOf(compress));
+ }
+
+ if (check) {
+ readTest(fs, data, file);
+ }
+
+ } finally {
+ // fs is still null if an error occurred before it was opened; guard
+ // so close() cannot mask the original exception with an NPE.
+ if (fs != null) {
+ fs.close();
+ }
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestSortedMapWritable.java b/src/test/org/apache/hadoop/io/TestSortedMapWritable.java
new file mode 100644
index 00000000000..927bfc1f42d
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestSortedMapWritable.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.io;
+
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests SortedMapWritable
+ */
+public class TestSortedMapWritable extends TestCase {
+ /** the test */
+ @SuppressWarnings("unchecked")
+ public void testSortedMapWritable() {
+ Text[] keys = {
+ new Text("key1"),
+ new Text("key2"),
+ new Text("key3"),
+ };
+
+ BytesWritable[] values = {
+ new BytesWritable("value1".getBytes()),
+ new BytesWritable("value2".getBytes()),
+ new BytesWritable("value3".getBytes())
+ };
+
+ SortedMapWritable inMap = new SortedMapWritable();
+ for (int i = 0; i < keys.length; i++) {
+ inMap.put(keys[i], values[i]);
+ }
+
+ // Keys were inserted in sorted order; first/last must reflect that order.
+ assertEquals(0, inMap.firstKey().compareTo(keys[0]));
+ assertEquals(0, inMap.lastKey().compareTo(keys[2]));
+
+ // The copy constructor must produce an equal-sized, entry-equal map.
+ SortedMapWritable outMap = new SortedMapWritable(inMap);
+ assertEquals(inMap.size(), outMap.size());
+
+ for (Map.Entry e: inMap.entrySet()) {
+ assertTrue(outMap.containsKey(e.getKey()));
+ assertEquals(0, ((WritableComparable) outMap.get(e.getKey())).compareTo(
+ e.getValue()));
+ }
+
+ // Now for something a little harder...
+
+ Text[] maps = {
+ new Text("map1"),
+ new Text("map2")
+ };
+
+ // Nest SortedMapWritables inside one and copy the whole structure.
+ SortedMapWritable mapOfMaps = new SortedMapWritable();
+ mapOfMaps.put(maps[0], inMap);
+ mapOfMaps.put(maps[1], outMap);
+
+ SortedMapWritable copyOfMapOfMaps = new SortedMapWritable(mapOfMaps);
+ for (int i = 0; i < maps.length; i++) {
+ assertTrue(copyOfMapOfMaps.containsKey(maps[i]));
+
+ SortedMapWritable a = (SortedMapWritable) mapOfMaps.get(maps[i]);
+ SortedMapWritable b = (SortedMapWritable) copyOfMapOfMaps.get(maps[i]);
+ assertEquals(a.size(), b.size());
+ for (Writable key: a.keySet()) {
+ assertTrue(b.containsKey(key));
+
+ // This will work because we know what we put into each set
+
+ WritableComparable aValue = (WritableComparable) a.get(key);
+ WritableComparable bValue = (WritableComparable) b.get(key);
+ assertEquals(0, aValue.compareTo(bValue));
+ }
+ }
+ }
+
+ /**
+ * Test that number of "unknown" classes is propagated across multiple copies.
+ */
+ @SuppressWarnings("deprecation")
+ public void testForeignClass() {
+ SortedMapWritable inMap = new SortedMapWritable();
+ inMap.put(new Text("key"), new UTF8("value"));
+ inMap.put(new Text("key2"), new UTF8("value2"));
+ SortedMapWritable outMap = new SortedMapWritable(inMap);
+ SortedMapWritable copyOfCopy = new SortedMapWritable(outMap);
+ // Both values share one foreign class (UTF8), so the count stays 1
+ // through two copy-constructor hops.
+ assertEquals(1, copyOfCopy.getNewClasses());
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/TestText.java b/src/test/org/apache/hadoop/io/TestText.java
new file mode 100644
index 00000000000..6e004860991
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestText.java
@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.util.Random;
+
+/** Unit tests for LargeUTF8. */
+public class TestText extends TestCase {
+ private static final int NUM_ITERATIONS = 100;
+ public TestText(String name) { super(name); }
+
+ private static final Random RANDOM = new Random(1);
+
+ private static final int RAND_LEN = -1;
+
+ // generate a valid java String
+ private static String getTestString(int len) throws Exception {
+ StringBuffer buffer = new StringBuffer();
+ int length = (len==RAND_LEN) ? RANDOM.nextInt(1000) : len;
+ while (buffer.length() test = WritableName.getClass("long",conf);
+ assertTrue(test != null);
+ }
+
+ public void testSetName() throws Exception {
+ Configuration conf = new Configuration();
+ WritableName.setName(SimpleWritable.class, testName);
+
+ Class> test = WritableName.getClass(testName,conf);
+ assertTrue(test.equals(SimpleWritable.class));
+ }
+
+
+ public void testAddName() throws Exception {
+ Configuration conf = new Configuration();
+ String altName = testName + ".alt";
+
+ WritableName.addName(SimpleWritable.class, altName);
+
+ Class> test = WritableName.getClass(altName, conf);
+ assertTrue(test.equals(SimpleWritable.class));
+
+ // check original name still works
+ test = WritableName.getClass(testName, conf);
+ assertTrue(test.equals(SimpleWritable.class));
+
+ }
+
+ public void testBadName() throws Exception {
+ Configuration conf = new Configuration();
+ try {
+ Class> test = WritableName.getClass("unknown_junk",conf);
+ assertTrue(false);
+ } catch(IOException e) {
+ assertTrue(e.getMessage().matches(".*unknown_junk.*"));
+ }
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/io/TestWritableUtils.java b/src/test/org/apache/hadoop/io/TestWritableUtils.java
new file mode 100644
index 00000000000..2487fc0612c
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/TestWritableUtils.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import junit.framework.TestCase;
+
+public class TestWritableUtils extends TestCase {
+  private static final Log LOG = LogFactory.getLog(TestWritableUtils.class);
+
+  /**
+   * Round-trips one int through the variable-length encoding and checks that
+   * every size-reporting API agrees on the encoded length.
+   *
+   * @param val     value to encode and decode
+   * @param vintlen expected encoded length in bytes
+   */
+  public static void testValue(int val, int vintlen) throws IOException {
+    DataOutputBuffer out = new DataOutputBuffer();
+    DataInputBuffer in = new DataInputBuffer();
+    WritableUtils.writeVInt(out, val);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Value = " + val);
+      BytesWritable printer = new BytesWritable();
+      printer.set(out.getData(), 0, out.getLength());
+      LOG.debug("Buffer = " + printer);
+    }
+    in.reset(out.getData(), 0, out.getLength());
+    // Decoded value must round-trip, and all three length queries must agree.
+    assertEquals(val, WritableUtils.readVInt(in));
+    assertEquals(vintlen, out.getLength());
+    assertEquals(vintlen, WritableUtils.getVIntSize(val));
+    assertEquals(vintlen, WritableUtils.decodeVIntSize(out.getData()[0]));
+  }
+
+  /** Exercises the boundary values where the vint encoding grows by a byte. */
+  public static void testVInt() throws Exception {
+    testValue(12, 1);
+    testValue(127, 1);
+    testValue(-112, 1);
+    testValue(-113, 2);
+    testValue(-128, 2);
+    testValue(128, 2);
+    testValue(-129, 2);
+    testValue(255, 2);
+    testValue(-256, 2);
+    testValue(256, 3);
+    testValue(-257, 3);
+    testValue(65535, 3);
+    testValue(-65536, 3);
+    testValue(65536, 4);
+    testValue(-65537, 4);
+  }
+}
diff --git a/src/test/org/apache/hadoop/io/compress/TestCodec.java b/src/test/org/apache/hadoop/io/compress/TestCodec.java
new file mode 100644
index 00000000000..38e4a358376
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/compress/TestCodec.java
@@ -0,0 +1,249 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.io.compress;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.RandomDatum;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.compress.CompressionOutputStream;
+import org.apache.hadoop.io.compress.zlib.ZlibFactory;
+
+public class TestCodec extends TestCase {
+
+ private static final Log LOG=
+ LogFactory.getLog(TestCodec.class);
+
+ private Configuration conf = new Configuration();
+ private int count = 10000;
+ private int seed = new Random().nextInt();
+
+ public void testDefaultCodec() throws IOException {
+ // Cover the zero-record edge case as well as a sizable record stream.
+ codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.DefaultCodec");
+ codecTest(conf, seed, count, "org.apache.hadoop.io.compress.DefaultCodec");
+ }
+
+ public void testGzipCodec() throws IOException {
+ // Cover the zero-record edge case as well as a sizable record stream.
+ codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.GzipCodec");
+ codecTest(conf, seed, count, "org.apache.hadoop.io.compress.GzipCodec");
+ }
+
+ public void testBZip2Codec() throws IOException {
+ // Cover the zero-record edge case as well as a sizable record stream.
+ codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.BZip2Codec");
+ codecTest(conf, seed, count, "org.apache.hadoop.io.compress.BZip2Codec");
+ }
+
+ /**
+  * Round-trips {@code count} random key/value records through the given codec
+  * and verifies the decompressed stream matches the original data.
+  *
+  * @param conf       configuration used to instantiate the codec
+  * @param seed       seed for the random record generator
+  * @param count      number of key/value pairs to generate
+  * @param codecClass fully-qualified CompressionCodec class name
+  * @throws IOException if the codec class cannot be loaded or I/O fails
+  */
+ private static void codecTest(Configuration conf, int seed, int count,
+                               String codecClass)
+   throws IOException {
+
+   // Create the codec
+   CompressionCodec codec = null;
+   try {
+     codec = (CompressionCodec)
+       ReflectionUtils.newInstance(conf.getClassByName(codecClass), conf);
+   } catch (ClassNotFoundException cnfe) {
+     throw new IOException("Illegal codec!");
+   }
+   LOG.info("Created a Codec object of type: " + codecClass);
+
+   // Generate data
+   DataOutputBuffer data = new DataOutputBuffer();
+   RandomDatum.Generator generator = new RandomDatum.Generator(seed);
+   for (int i = 0; i < count; ++i) {
+     generator.next();
+     RandomDatum key = generator.getKey();
+     RandomDatum value = generator.getValue();
+
+     key.write(data);
+     value.write(data);
+   }
+   DataInputBuffer originalData = new DataInputBuffer();
+   DataInputStream originalIn = new DataInputStream(new BufferedInputStream(originalData));
+   originalData.reset(data.getData(), 0, data.getLength());
+
+   LOG.info("Generated " + count + " records");
+
+   // Compress data
+   DataOutputBuffer compressedDataBuffer = new DataOutputBuffer();
+   CompressionOutputStream deflateFilter =
+     codec.createOutputStream(compressedDataBuffer);
+   DataOutputStream deflateOut =
+     new DataOutputStream(new BufferedOutputStream(deflateFilter));
+   deflateOut.write(data.getData(), 0, data.getLength());
+   deflateOut.flush();
+   // finish() flushes the compressor's trailer without closing the stream.
+   deflateFilter.finish();
+   LOG.info("Finished compressing data");
+
+   // De-compress data
+   DataInputBuffer deCompressedDataBuffer = new DataInputBuffer();
+   deCompressedDataBuffer.reset(compressedDataBuffer.getData(), 0,
+                                compressedDataBuffer.getLength());
+   CompressionInputStream inflateFilter =
+     codec.createInputStream(deCompressedDataBuffer);
+   DataInputStream inflateIn =
+     new DataInputStream(new BufferedInputStream(inflateFilter));
+
+   // Check: every decompressed record must equal the record that was written.
+   // (Previously the records were read back but never compared, so any
+   // corruption in the compress/decompress cycle went undetected.)
+   for (int i = 0; i < count; ++i) {
+     RandomDatum k1 = new RandomDatum();
+     RandomDatum v1 = new RandomDatum();
+     k1.readFields(originalIn);
+     v1.readFields(originalIn);
+
+     RandomDatum k2 = new RandomDatum();
+     RandomDatum v2 = new RandomDatum();
+     k2.readFields(inflateIn);
+     v2.readFields(inflateIn);
+
+     assertTrue("original and compressed-then-decompressed keys differ",
+                k1.equals(k2));
+     assertTrue("original and compressed-then-decompressed values differ",
+                v1.equals(v2));
+   }
+   LOG.info("SUCCESS! Completed checking " + count + " records");
+ }
+
+ public void testCodecPoolGzipReuse() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setBoolean("hadoop.native.lib", true);
+ // Skip rather than fail when native zlib is unavailable in this JVM.
+ if (!ZlibFactory.isNativeZlibLoaded(conf)) {
+ LOG.warn("testCodecPoolGzipReuse skipped: native libs not loaded");
+ return;
+ }
+ GzipCodec gzc = ReflectionUtils.newInstance(GzipCodec.class, conf);
+ DefaultCodec dfc = ReflectionUtils.newInstance(DefaultCodec.class, conf);
+ Compressor c1 = CodecPool.getCompressor(gzc);
+ Compressor c2 = CodecPool.getCompressor(dfc);
+ CodecPool.returnCompressor(c1);
+ CodecPool.returnCompressor(c2);
+ // NOTE(review): asserts the pool does not hand a DefaultCodec compressor
+ // back for a Gzip request — presumably the pool keys by compressor type;
+ // confirm against CodecPool's implementation.
+ assertTrue("Got mismatched ZlibCompressor", c2 != CodecPool.getCompressor(gzc));
+ }
+
+ public void testSequenceFileDefaultCodec() throws IOException, ClassNotFoundException,
+ InstantiationException, IllegalAccessException {
+ // Small and large block sizes exercise single- and multi-block files.
+ sequenceFileCodecTest(conf, 100, "org.apache.hadoop.io.compress.DefaultCodec", 100);
+ sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.DefaultCodec", 1000000);
+ }
+
+ public void testSequenceFileBZip2Codec() throws IOException, ClassNotFoundException,
+ InstantiationException, IllegalAccessException {
+ // Also covers the zero-line edge case in addition to small and large files.
+ sequenceFileCodecTest(conf, 0, "org.apache.hadoop.io.compress.BZip2Codec", 100);
+ sequenceFileCodecTest(conf, 100, "org.apache.hadoop.io.compress.BZip2Codec", 100);
+ sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.BZip2Codec", 1000000);
+ }
+
+ private static void sequenceFileCodecTest(Configuration conf, int lines,
+ String codecClass, int blockSize)
+ throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
+
+ Path filePath = new Path("SequenceFileCodecTest." + codecClass);
+ // Configuration
+ conf.setInt("io.seqfile.compress.blocksize", blockSize);
+
+ // Create the SequenceFile
+ FileSystem fs = FileSystem.get(conf);
+ LOG.info("Creating SequenceFile with codec \"" + codecClass + "\"");
+ SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, filePath,
+ Text.class, Text.class, CompressionType.BLOCK,
+ (CompressionCodec)Class.forName(codecClass).newInstance());
+
+ // Write some data
+ LOG.info("Writing to SequenceFile...");
+ for (int i=0; i getCompressorType() {
+ return null;
+ }
+
+ // Stub: the factory tests only consult getDefaultExtension().
+ public Compressor createCompressor() {
+ return null;
+ }
+
+ // Stub: the factory tests only consult getDefaultExtension().
+ public CompressionInputStream createInputStream(InputStream in,
+ Decompressor decompressor)
+ throws IOException {
+ return null;
+ }
+
+ // Stub: the factory tests only consult getDefaultExtension().
+ public CompressionInputStream createInputStream(InputStream in)
+ throws IOException {
+ return null;
+ }
+
+ // Stub: the factory tests only consult getDefaultExtension().
+ public CompressionOutputStream createOutputStream(OutputStream out,
+ Compressor compressor)
+ throws IOException {
+ return null;
+ }
+
+ public Class extends Decompressor> getDecompressorType() {
+ return null;
+ }
+
+ // Stub: the factory tests only consult getDefaultExtension().
+ public Decompressor createDecompressor() {
+ return null;
+ }
+
+ // The only method the factory-lookup tests actually exercise.
+ public String getDefaultExtension() {
+ return ".base";
+ }
+ }
+
+ private static class BarCodec extends BaseCodec {
+ public String getDefaultExtension() {
+ // NOTE(review): no leading dot, unlike the other codecs — presumably
+ // deliberate to exercise extension normalization in the factory; confirm.
+ return "bar";
+ }
+ }
+
+ // Two-part extension: must win over FooCodec for names ending ".foo.bar".
+ private static class FooBarCodec extends BaseCodec {
+ public String getDefaultExtension() {
+ return ".foo.bar";
+ }
+ }
+
+ // Single-part extension used to test ordinary suffix matching.
+ private static class FooCodec extends BaseCodec {
+ public String getDefaultExtension() {
+ return ".foo";
+ }
+ }
+
+ /**
+ * Returns a factory for a given set of codecs
+ * @param classes the codec classes to include
+ * @return a new factory
+ */
+ // NOTE(review): raw Class[] — cannot be parameterized without changing the
+ // signature; callers pass Class[] literals below.
+ private static CompressionCodecFactory setClasses(Class[] classes) {
+ Configuration conf = new Configuration();
+ CompressionCodecFactory.setCodecClasses(conf, Arrays.asList(classes));
+ return new CompressionCodecFactory(conf);
+ }
+
+ private static void checkCodec(String msg,
+ Class expected, CompressionCodec actual) {
+ assertEquals(msg + " unexpected codec found",
+ expected.getName(),
+ actual.getClass().getName());
+ }
+
+ // Walks three factory configurations — default, empty, and custom —
+ // checking which codec (if any) each path extension resolves to.
+ public static void testFinding() {
+ CompressionCodecFactory factory =
+ new CompressionCodecFactory(new Configuration());
+ // Default factory: only the built-in gz/bz2 codecs are registered.
+ CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar"));
+ assertEquals("default factory foo codec", null, codec);
+ codec = factory.getCodec(new Path("/tmp/foo.gz"));
+ checkCodec("default factory for .gz", GzipCodec.class, codec);
+ codec = factory.getCodec(new Path("/tmp/foo.bz2"));
+ checkCodec("default factory for .bz2", BZip2Codec.class, codec);
+ // Empty factory: nothing resolves, not even the built-in extensions.
+ factory = setClasses(new Class[0]);
+ codec = factory.getCodec(new Path("/tmp/foo.bar"));
+ assertEquals("empty codec bar codec", null, codec);
+ codec = factory.getCodec(new Path("/tmp/foo.gz"));
+ assertEquals("empty codec gz codec", null, codec);
+ codec = factory.getCodec(new Path("/tmp/foo.bz2"));
+ assertEquals("default factory for .bz2", null, codec);
+ // Custom factory: only the three test codecs resolve; .gz/.bz2 do not.
+ factory = setClasses(new Class[]{BarCodec.class, FooCodec.class,
+ FooBarCodec.class});
+ codec = factory.getCodec(new Path("/tmp/.foo.bar.gz"));
+ assertEquals("full factory gz codec", null, codec);
+ codec = factory.getCodec(new Path("/tmp/foo.bz2"));
+ assertEquals("default factory for .bz2", null, codec);
+ codec = factory.getCodec(new Path("/tmp/foo.bar"));
+ checkCodec("full factory bar codec", BarCodec.class, codec);
+ // The longer ".foo.bar" extension must win over plain ".foo".
+ codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
+ checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
+ codec = factory.getCodec(new Path("/tmp/foo.foo"));
+ checkCodec("full factory foo codec", FooCodec.class, codec);
+ }
+}
diff --git a/src/test/org/apache/hadoop/io/retry/TestRetryProxy.java b/src/test/org/apache/hadoop/io/retry/TestRetryProxy.java
new file mode 100644
index 00000000000..c48e87b7dd9
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/retry/TestRetryProxy.java
@@ -0,0 +1,170 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io.retry;
+
+import static org.apache.hadoop.io.retry.RetryPolicies.RETRY_FOREVER;
+import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_DONT_FAIL;
+import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_THEN_FAIL;
+import static org.apache.hadoop.io.retry.RetryPolicies.retryByException;
+import static org.apache.hadoop.io.retry.RetryPolicies.retryByRemoteException;
+import static org.apache.hadoop.io.retry.RetryPolicies.retryUpToMaximumCountWithFixedSleep;
+import static org.apache.hadoop.io.retry.RetryPolicies.retryUpToMaximumCountWithProportionalSleep;
+import static org.apache.hadoop.io.retry.RetryPolicies.retryUpToMaximumTimeWithFixedSleep;
+import static org.apache.hadoop.io.retry.RetryPolicies.exponentialBackoffRetry;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.io.retry.UnreliableInterface.FatalException;
+import org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException;
+import org.apache.hadoop.ipc.RemoteException;
+
+/**
+ * Tests for {@link RetryProxy}: each test wraps an {@link UnreliableImplementation}
+ * in a proxy with a particular retry policy and checks when calls succeed or fail.
+ */
+public class TestRetryProxy extends TestCase {
+
+  private UnreliableImplementation unreliableImpl;
+
+  @Override
+  protected void setUp() throws Exception {
+    // Fresh instance per test so per-method failure counters start at zero.
+    unreliableImpl = new UnreliableImplementation();
+  }
+
+  public void testTryOnceThenFail() throws UnreliableException {
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl, TRY_ONCE_THEN_FAIL);
+    unreliable.alwaysSucceeds();
+    try {
+      unreliable.failsOnceThenSucceeds();
+      fail("Should fail");
+    } catch (UnreliableException e) {
+      // expected
+    }
+  }
+
+  public void testTryOnceDontFail() throws UnreliableException {
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl, TRY_ONCE_DONT_FAIL);
+    unreliable.alwaysSucceeds();
+    unreliable.failsOnceThenSucceeds();
+    try {
+      // A swallowed failure cannot fabricate a return value, so the
+      // value-returning variant must still throw.
+      unreliable.failsOnceThenSucceedsWithReturnValue();
+      fail("Should fail");
+    } catch (UnreliableException e) {
+      // expected
+    }
+  }
+
+  public void testRetryForever() throws UnreliableException {
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl, RETRY_FOREVER);
+    unreliable.alwaysSucceeds();
+    unreliable.failsOnceThenSucceeds();
+    unreliable.failsTenTimesThenSucceeds();
+  }
+
+  public void testRetryUpToMaximumCountWithFixedSleep() throws UnreliableException {
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+                        retryUpToMaximumCountWithFixedSleep(8, 1, TimeUnit.NANOSECONDS));
+    unreliable.alwaysSucceeds();
+    unreliable.failsOnceThenSucceeds();
+    try {
+      // 8 retries are not enough for a method that fails ten times.
+      unreliable.failsTenTimesThenSucceeds();
+      fail("Should fail");
+    } catch (UnreliableException e) {
+      // expected
+    }
+  }
+
+  public void testRetryUpToMaximumTimeWithFixedSleep() throws UnreliableException {
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+                        retryUpToMaximumTimeWithFixedSleep(80, 10, TimeUnit.NANOSECONDS));
+    unreliable.alwaysSucceeds();
+    unreliable.failsOnceThenSucceeds();
+    try {
+      // 80ns budget at 10ns per sleep allows at most 8 retries.
+      unreliable.failsTenTimesThenSucceeds();
+      fail("Should fail");
+    } catch (UnreliableException e) {
+      // expected
+    }
+  }
+
+  public void testRetryUpToMaximumCountWithProportionalSleep() throws UnreliableException {
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+                        retryUpToMaximumCountWithProportionalSleep(8, 1, TimeUnit.NANOSECONDS));
+    unreliable.alwaysSucceeds();
+    unreliable.failsOnceThenSucceeds();
+    try {
+      unreliable.failsTenTimesThenSucceeds();
+      fail("Should fail");
+    } catch (UnreliableException e) {
+      // expected
+    }
+  }
+
+  public void testExponentialRetry() throws UnreliableException {
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+                        exponentialBackoffRetry(5, 1L, TimeUnit.NANOSECONDS));
+    unreliable.alwaysSucceeds();
+    unreliable.failsOnceThenSucceeds();
+    try {
+      // 5 retries are not enough for a method that fails ten times.
+      unreliable.failsTenTimesThenSucceeds();
+      fail("Should fail");
+    } catch (UnreliableException e) {
+      // expected
+    }
+  }
+
+  public void testRetryByException() throws UnreliableException {
+    // Restored the stripped generic type arguments (the map declaration and
+    // the explicit type witness on singletonMap had been garbled).
+    Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap =
+      Collections.<Class<? extends Exception>, RetryPolicy>singletonMap(
+          FatalException.class, TRY_ONCE_THEN_FAIL);
+
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+                        retryByException(RETRY_FOREVER, exceptionToPolicyMap));
+    unreliable.failsOnceThenSucceeds();
+    try {
+      // FatalException maps to TRY_ONCE_THEN_FAIL, so it must not be retried.
+      unreliable.alwaysFailsWithFatalException();
+      fail("Should fail");
+    } catch (FatalException e) {
+      // expected
+    }
+  }
+
+  public void testRetryByRemoteException() throws UnreliableException {
+    // Restored the stripped generic type arguments (see testRetryByException).
+    Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap =
+      Collections.<Class<? extends Exception>, RetryPolicy>singletonMap(
+          FatalException.class, TRY_ONCE_THEN_FAIL);
+
+    UnreliableInterface unreliable = (UnreliableInterface)
+      RetryProxy.create(UnreliableInterface.class, unreliableImpl,
+                        retryByRemoteException(RETRY_FOREVER, exceptionToPolicyMap));
+    try {
+      unreliable.alwaysFailsWithRemoteFatalException();
+      fail("Should fail");
+    } catch (RemoteException e) {
+      // expected
+    }
+  }
+
+}
diff --git a/src/test/org/apache/hadoop/io/retry/UnreliableImplementation.java b/src/test/org/apache/hadoop/io/retry/UnreliableImplementation.java
new file mode 100644
index 00000000000..5971ee72165
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/retry/UnreliableImplementation.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io.retry;
+
+import org.apache.hadoop.ipc.RemoteException;
+
+/**
+ * Test double whose methods fail a fixed number of times before succeeding,
+ * driven by per-method invocation counters.
+ */
+public class UnreliableImplementation implements UnreliableInterface {
+
+  // Per-method invocation counters; each method fails until its counter
+  // passes the configured threshold.
+  private int failsOnceCount;
+  private int failsOnceWithValueCount;
+  private int failsTenTimesCount;
+
+  public void alwaysSucceeds() {
+    // no-op: this method never fails
+  }
+
+  public void alwaysFailsWithFatalException() throws FatalException {
+    throw new FatalException();
+  }
+
+  public void alwaysFailsWithRemoteFatalException() throws RemoteException {
+    throw new RemoteException(FatalException.class.getName(), "Oops");
+  }
+
+  public void failsOnceThenSucceeds() throws UnreliableException {
+    if (failsOnceCount++ == 0) {
+      throw new UnreliableException();
+    }
+  }
+
+  public boolean failsOnceThenSucceedsWithReturnValue() throws UnreliableException {
+    if (failsOnceWithValueCount++ == 0) {
+      throw new UnreliableException();
+    }
+    return true;
+  }
+
+  public void failsTenTimesThenSucceeds() throws UnreliableException {
+    if (failsTenTimesCount++ < 10) {
+      throw new UnreliableException();
+    }
+  }
+
+}
diff --git a/src/test/org/apache/hadoop/io/retry/UnreliableInterface.java b/src/test/org/apache/hadoop/io/retry/UnreliableInterface.java
new file mode 100644
index 00000000000..af4959151e7
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/retry/UnreliableInterface.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io.retry;
+
+import org.apache.hadoop.ipc.RemoteException;
+
+// Contract for the retry-proxy tests: methods either always succeed, always
+// fail, or fail a fixed number of times before succeeding.
+public interface UnreliableInterface {
+
+ // Retryable failure thrown by the "fails N times" methods.
+ public static class UnreliableException extends Exception {
+ // no body
+ }
+
+ // Non-retryable failure; retry policies map it to TRY_ONCE_THEN_FAIL.
+ public static class FatalException extends UnreliableException {
+ // no body
+ }
+
+ void alwaysSucceeds() throws UnreliableException;
+
+ void alwaysFailsWithFatalException() throws FatalException;
+ void alwaysFailsWithRemoteFatalException() throws RemoteException;
+
+ void failsOnceThenSucceeds() throws UnreliableException;
+ boolean failsOnceThenSucceedsWithReturnValue() throws UnreliableException;
+
+ void failsTenTimesThenSucceeds() throws UnreliableException;
+}
diff --git a/src/test/org/apache/hadoop/io/serializer/TestWritableSerialization.java b/src/test/org/apache/hadoop/io/serializer/TestWritableSerialization.java
new file mode 100644
index 00000000000..6a551753245
--- /dev/null
+++ b/src/test/org/apache/hadoop/io/serializer/TestWritableSerialization.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io.serializer;
+
+import static org.apache.hadoop.io.TestGenericWritable.CONF_TEST_KEY;
+import static org.apache.hadoop.io.TestGenericWritable.CONF_TEST_VALUE;
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.TestGenericWritable.Baz;
+import org.apache.hadoop.io.TestGenericWritable.FooGenericWritable;
+import org.apache.hadoop.util.GenericsUtil;
+
+/**
+ * Tests WritableSerialization by round-tripping Writables through the
+ * pluggable serialization framework.
+ */
+public class TestWritableSerialization extends TestCase {
+
+  private static final Configuration conf = new Configuration();
+
+  static {
+    // Restrict the framework to WritableSerialization so that is what gets tested.
+    conf.set("io.serializations"
+        , "org.apache.hadoop.io.serializer.WritableSerialization");
+  }
+
+  public void testWritableSerialization() throws Exception {
+    Text before = new Text("test writable");
+    testSerialization(conf, before);
+  }
+
+
+  public void testWritableConfigurable() throws Exception {
+
+    //set the configuration parameter
+    conf.set(CONF_TEST_KEY, CONF_TEST_VALUE);
+
+    //reuse TestGenericWritable inner classes to test
+    //writables that also implement Configurable.
+    FooGenericWritable generic = new FooGenericWritable();
+    generic.setConf(conf);
+    Baz baz = new Baz();
+    generic.set(baz);
+    Baz result = testSerialization(conf, baz);
+    // The deserializer must have injected the Configuration.
+    assertNotNull(result.getConf());
+  }
+
+  /**
+   * A utility that tests serialization/deserialization.
+   * (Restored the stripped generic parameter {@code <K>}: the method
+   * declaration and the Serializer/Deserializer types had been garbled.)
+   * @param <K> the class of the item
+   * @param conf configuration to use, "io.serializations" is read to
+   * determine the serialization
+   * @param before item to (de)serialize
+   * @return deserialized item
+   */
+  public static <K> K testSerialization(Configuration conf, K before)
+      throws Exception {
+
+    SerializationFactory factory = new SerializationFactory(conf);
+    Serializer<K> serializer
+      = factory.getSerializer(GenericsUtil.getClass(before));
+    Deserializer<K> deserializer
+      = factory.getDeserializer(GenericsUtil.getClass(before));
+
+    DataOutputBuffer out = new DataOutputBuffer();
+    serializer.open(out);
+    serializer.serialize(before);
+    serializer.close();
+
+    DataInputBuffer in = new DataInputBuffer();
+    in.reset(out.getData(), out.getLength());
+    deserializer.open(in);
+    // null asks the deserializer to allocate a fresh instance.
+    K after = deserializer.deserialize(null);
+    deserializer.close();
+
+    assertEquals(before, after);
+    return after;
+  }
+
+}
diff --git a/src/test/org/apache/hadoop/ipc/TestIPC.java b/src/test/org/apache/hadoop/ipc/TestIPC.java
new file mode 100644
index 00000000000..df5a1558153
--- /dev/null
+++ b/src/test/org/apache/hadoop/ipc/TestIPC.java
@@ -0,0 +1,243 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ipc;
+
+import org.apache.commons.logging.*;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.net.NetUtils;
+
+import java.util.Random;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+
+/** Unit tests for IPC. */
+public class TestIPC extends TestCase {
+ public static final Log LOG =
+ LogFactory.getLog(TestIPC.class);
+
+ final private static Configuration conf = new Configuration();
+ final static private int PING_INTERVAL = 1000;
+
+ static {
+ Client.setPingInterval(conf, PING_INTERVAL);
+ }
+ public TestIPC(String name) { super(name); }
+
+ private static final Random RANDOM = new Random();
+
+ private static final String ADDRESS = "0.0.0.0";
+
+ /** Echo server; optionally sleeps up to 2*PING_INTERVAL per call so
+ * client-side ping handling is exercised. */
+ private static class TestServer extends Server {
+ private boolean sleep;
+
+ public TestServer(int handlerCount, boolean sleep)
+ throws IOException {
+ super(ADDRESS, 0, LongWritable.class, handlerCount, conf);
+ this.sleep = sleep;
+ }
+
+ @Override
+ public Writable call(Class<?> protocol, Writable param, long receiveTime)
+ throws IOException {
+ if (sleep) {
+ try {
+ Thread.sleep(RANDOM.nextInt(2*PING_INTERVAL)); // sleep a bit
+ } catch (InterruptedException e) {}
+ }
+ return param; // echo param as result
+ }
+ }
+
+ /** Issues {@code count} serial echo calls; records failure on any
+ * mismatched response or exception. */
+ private static class SerialCaller extends Thread {
+ private Client client;
+ private InetSocketAddress server;
+ private int count;
+ private boolean failed;
+
+ public SerialCaller(Client client, InetSocketAddress server, int count) {
+ this.client = client;
+ this.server = server;
+ this.count = count;
+ }
+
+ public void run() {
+ for (int i = 0; i < count; i++) {
+ try {
+ LongWritable param = new LongWritable(RANDOM.nextLong());
+ LongWritable value =
+ (LongWritable)client.call(param, server);
+ if (!param.equals(value)) {
+ LOG.fatal("Call failed!");
+ failed = true;
+ break;
+ }
+ } catch (Exception e) {
+ LOG.fatal("Caught: " + StringUtils.stringifyException(e));
+ failed = true;
+ }
+ }
+ }
+ }
+
+ /** Issues {@code count} rounds of parallel (multi-address) echo calls. */
+ private static class ParallelCaller extends Thread {
+ private Client client;
+ private int count;
+ private InetSocketAddress[] addresses;
+ private boolean failed;
+
+ public ParallelCaller(Client client, InetSocketAddress[] addresses,
+ int count) {
+ this.client = client;
+ this.addresses = addresses;
+ this.count = count;
+ }
+
+ public void run() {
+ for (int i = 0; i < count; i++) {
+ try {
+ Writable[] params = new Writable[addresses.length];
+ for (int j = 0; j < addresses.length; j++)
+ params[j] = new LongWritable(RANDOM.nextLong());
+ Writable[] values = client.call(params, addresses);
+ for (int j = 0; j < addresses.length; j++) {
+ if (!params[j].equals(values[j])) {
+ LOG.fatal("Call failed!");
+ failed = true;
+ break;
+ }
+ }
+ } catch (Exception e) {
+ LOG.fatal("Caught: " + StringUtils.stringifyException(e));
+ failed = true;
+ }
+ }
+ }
+ }
+
+ public void testSerial() throws Exception {
+ testSerial(3, false, 2, 5, 100);
+ }
+
+ public void testSerial(int handlerCount, boolean handlerSleep,
+ int clientCount, int callerCount, int callCount)
+ throws Exception {
+ Server server = new TestServer(handlerCount, handlerSleep);
+ InetSocketAddress addr = NetUtils.getConnectAddress(server);
+ server.start();
+
+ Client[] clients = new Client[clientCount];
+ for (int i = 0; i < clientCount; i++) {
+ clients[i] = new Client(LongWritable.class, conf);
+ }
+
+ SerialCaller[] callers = new SerialCaller[callerCount];
+ for (int i = 0; i < callerCount; i++) {
+ callers[i] = new SerialCaller(clients[i%clientCount], addr, callCount);
+ callers[i].start();
+ }
+ // join() establishes happens-before, so reading callers[i].failed is safe
+ for (int i = 0; i < callerCount; i++) {
+ callers[i].join();
+ assertFalse(callers[i].failed);
+ }
+ for (int i = 0; i < clientCount; i++) {
+ clients[i].stop();
+ }
+ server.stop();
+ }
+
+ public void testParallel() throws Exception {
+ testParallel(10, false, 2, 4, 2, 4, 100);
+ }
+
+ public void testParallel(int handlerCount, boolean handlerSleep,
+ int serverCount, int addressCount,
+ int clientCount, int callerCount, int callCount)
+ throws Exception {
+ Server[] servers = new Server[serverCount];
+ for (int i = 0; i < serverCount; i++) {
+ servers[i] = new TestServer(handlerCount, handlerSleep);
+ servers[i].start();
+ }
+
+ InetSocketAddress[] addresses = new InetSocketAddress[addressCount];
+ for (int i = 0; i < addressCount; i++) {
+ addresses[i] = NetUtils.getConnectAddress(servers[i%serverCount]);
+ }
+
+ Client[] clients = new Client[clientCount];
+ for (int i = 0; i < clientCount; i++) {
+ clients[i] = new Client(LongWritable.class, conf);
+ }
+
+ ParallelCaller[] callers = new ParallelCaller[callerCount];
+ for (int i = 0; i < callerCount; i++) {
+ callers[i] =
+ new ParallelCaller(clients[i%clientCount], addresses, callCount);
+ callers[i].start();
+ }
+ for (int i = 0; i < callerCount; i++) {
+ callers[i].join();
+ assertFalse(callers[i].failed);
+ }
+ for (int i = 0; i < clientCount; i++) {
+ clients[i].stop();
+ }
+ for (int i = 0; i < serverCount; i++) {
+ servers[i].stop();
+ }
+ }
+
+ /** A call to an unreachable port must fail with an IOException naming the
+ * target address and carrying a nested cause. */
+ public void testStandAloneClient() throws Exception {
+ testParallel(10, false, 2, 4, 2, 4, 100);
+ Client client = new Client(LongWritable.class, conf);
+ InetSocketAddress address = new InetSocketAddress("127.0.0.1", 10);
+ try {
+ client.call(new LongWritable(RANDOM.nextLong()),
+ address);
+ fail("Expected an exception to have been thrown");
+ } catch (IOException e) {
+ String message = e.getMessage();
+ String addressText = address.toString();
+ assertTrue("Did not find "+addressText+" in "+message,
+ message.contains(addressText));
+ Throwable cause=e.getCause();
+ assertNotNull("No nested exception in "+e,cause);
+ String causeText=cause.getMessage();
+ assertTrue("Did not find " + causeText + " in " + message,
+ message.contains(causeText));
+ }
+ }
+
+
+ public static void main(String[] args) throws Exception {
+
+ //new TestIPC("test").testSerial(5, false, 2, 10, 1000);
+
+ new TestIPC("test").testParallel(10, false, 2, 4, 2, 4, 1000);
+
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/ipc/TestIPCServerResponder.java b/src/test/org/apache/hadoop/ipc/TestIPCServerResponder.java
new file mode 100644
index 00000000000..2591da01432
--- /dev/null
+++ b/src/test/org/apache/hadoop/ipc/TestIPCServerResponder.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ipc;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.net.NetUtils;
+
+/**
+ * This test provokes partial writes in the server, which is
+ * serving multiple clients.
+ */
+public class TestIPCServerResponder extends TestCase {
+
+ public static final Log LOG =
+ LogFactory.getLog(TestIPCServerResponder.class);
+
+ private static Configuration conf = new Configuration();
+
+ public TestIPCServerResponder(final String name) {
+ super(name);
+ }
+
+ private static final Random RANDOM = new Random();
+
+ private static final String ADDRESS = "0.0.0.0";
+
+ private static final int BYTE_COUNT = 1024;
+ private static final byte[] BYTES = new byte[BYTE_COUNT];
+ static {
+ // deterministic payload: 'a'..'z' repeating
+ for (int i = 0; i < BYTE_COUNT; i++)
+ BYTES[i] = (byte) ('a' + (i % 26));
+ }
+
+ /** Echo server whose send buffer is half the maximum payload, so large
+ * responses block partway and exercise the responder's partial writes. */
+ private static class TestServer extends Server {
+
+ private boolean sleep;
+
+ public TestServer(final int handlerCount, final boolean sleep)
+ throws IOException {
+ super(ADDRESS, 0, BytesWritable.class, handlerCount, conf);
+ // Set the buffer size to half of the maximum parameter/result size
+ // to force the socket to block
+ this.setSocketSendBufSize(BYTE_COUNT / 2);
+ this.sleep = sleep;
+ }
+
+ @Override
+ public Writable call(Class<?> protocol, Writable param, long receiveTime)
+ throws IOException {
+ if (sleep) {
+ try {
+ Thread.sleep(RANDOM.nextInt(20)); // sleep a bit
+ } catch (InterruptedException e) {}
+ }
+ return param;
+ }
+ }
+
+ /** Sends {@code count} randomly-sized payloads; records failure on error. */
+ private static class Caller extends Thread {
+
+ private Client client;
+ private int count;
+ private InetSocketAddress address;
+ private boolean failed;
+
+ public Caller(final Client client, final InetSocketAddress address,
+ final int count) {
+ this.client = client;
+ this.address = address;
+ this.count = count;
+ }
+
+ @Override
+ public void run() {
+ for (int i = 0; i < count; i++) {
+ try {
+ int byteSize = RANDOM.nextInt(BYTE_COUNT);
+ byte[] bytes = new byte[byteSize];
+ System.arraycopy(BYTES, 0, bytes, 0, byteSize);
+ Writable param = new BytesWritable(bytes);
+ Writable value = client.call(param, address);
+ Thread.sleep(RANDOM.nextInt(20));
+ } catch (Exception e) {
+ LOG.fatal("Caught: " + e);
+ failed = true;
+ }
+ }
+ }
+ }
+
+ public void testServerResponder() throws Exception {
+ testServerResponder(10, true, 1, 10, 200);
+ }
+
+ public void testServerResponder(final int handlerCount,
+ final boolean handlerSleep,
+ final int clientCount,
+ final int callerCount,
+ final int callCount) throws Exception {
+ Server server = new TestServer(handlerCount, handlerSleep);
+ server.start();
+
+ InetSocketAddress address = NetUtils.getConnectAddress(server);
+ Client[] clients = new Client[clientCount];
+ for (int i = 0; i < clientCount; i++) {
+ clients[i] = new Client(BytesWritable.class, conf);
+ }
+
+ Caller[] callers = new Caller[callerCount];
+ for (int i = 0; i < callerCount; i++) {
+ callers[i] = new Caller(clients[i % clientCount], address, callCount);
+ callers[i].start();
+ }
+ // join() before reading failed gives the needed happens-before edge
+ for (int i = 0; i < callerCount; i++) {
+ callers[i].join();
+ assertFalse(callers[i].failed);
+ }
+ for (int i = 0; i < clientCount; i++) {
+ clients[i].stop();
+ }
+ server.stop();
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/ipc/TestRPC.java b/src/test/org/apache/hadoop/ipc/TestRPC.java
new file mode 100644
index 00000000000..d0db263cc1a
--- /dev/null
+++ b/src/test/org/apache/hadoop/ipc/TestRPC.java
@@ -0,0 +1,391 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ipc;
+
+import java.io.IOException;
+import java.net.ConnectException;
+import java.net.InetSocketAddress;
+import java.lang.reflect.Method;
+
+import junit.framework.TestCase;
+
+import java.util.Arrays;
+
+import org.apache.commons.logging.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.security.authorize.AuthorizationException;
+import org.apache.hadoop.security.authorize.ConfiguredPolicy;
+import org.apache.hadoop.security.authorize.PolicyProvider;
+import org.apache.hadoop.security.authorize.Service;
+import org.apache.hadoop.security.authorize.ServiceAuthorizationManager;
+
+/** Unit tests for RPC. */
+public class TestRPC extends TestCase {
+ private static final String ADDRESS = "0.0.0.0";
+
+ public static final Log LOG =
+ LogFactory.getLog(TestRPC.class);
+
+ private static Configuration conf = new Configuration();
+
+ int datasize = 1024*100;
+ int numThreads = 50;
+
+ public TestRPC(String name) { super(name); }
+
+ /** Test protocol exercising a variety of parameter/return shapes:
+ * void, String, arrays, Writable, primitives, and a thrown exception. */
+ public interface TestProtocol extends VersionedProtocol {
+ public static final long versionID = 1L;
+
+ void ping() throws IOException;
+ void slowPing(boolean shouldSlow) throws IOException;
+ String echo(String value) throws IOException;
+ String[] echo(String[] value) throws IOException;
+ Writable echo(Writable value) throws IOException;
+ int add(int v1, int v2) throws IOException;
+ int add(int[] values) throws IOException;
+ int error() throws IOException;
+ void testServerGet() throws IOException;
+ int[] exchange(int[] values) throws IOException;
+ }
+
+ /** Server-side implementation; slowPing(true) blocks until two
+ * slowPing(false) calls have been made (see fastPingCounter). */
+ public class TestImpl implements TestProtocol {
+ int fastPingCounter = 0;
+
+ public long getProtocolVersion(String protocol, long clientVersion) {
+ return TestProtocol.versionID;
+ }
+
+ public void ping() {}
+
+ public synchronized void slowPing(boolean shouldSlow) {
+ if (shouldSlow) {
+ while (fastPingCounter < 2) {
+ try {
+ wait(); // slow response until two fast pings happened
+ } catch (InterruptedException ignored) {}
+ }
+ fastPingCounter -= 2;
+ } else {
+ fastPingCounter++;
+ notify();
+ }
+ }
+
+ public String echo(String value) throws IOException { return value; }
+
+ public String[] echo(String[] values) throws IOException { return values; }
+
+ public Writable echo(Writable writable) {
+ return writable;
+ }
+ public int add(int v1, int v2) {
+ return v1 + v2;
+ }
+
+ public int add(int[] values) {
+ int sum = 0;
+ for (int i = 0; i < values.length; i++) {
+ sum += values[i];
+ }
+ return sum;
+ }
+
+ public int error() throws IOException {
+ throw new IOException("bobo");
+ }
+
+ public void testServerGet() throws IOException {
+ if (!(Server.get() instanceof RPC.Server)) {
+ throw new IOException("Server.get() failed");
+ }
+ }
+
+ public int[] exchange(int[] values) {
+ for (int i = 0; i < values.length; i++) {
+ values[i] = i;
+ }
+ return values;
+ }
+ }
+
+ //
+ // an object that does a bunch of transactions
+ //
+ static class Transactions implements Runnable {
+ int datasize;
+ TestProtocol proxy;
+
+ Transactions(TestProtocol proxy, int datasize) {
+ this.proxy = proxy;
+ this.datasize = datasize;
+ }
+
+ // do two RPC that transfers data.
+ public void run() {
+ int[] indata = new int[datasize];
+ int[] outdata = null;
+ int val = 0;
+ try {
+ outdata = proxy.exchange(indata);
+ val = proxy.add(1,2);
+ } catch (IOException e) {
+ assertTrue("Exception from RPC exchange() " + e, false);
+ }
+ assertEquals(indata.length, outdata.length);
+ assertEquals(val, 3);
+ for (int i = 0; i < outdata.length; i++) {
+ assertEquals(outdata[i], i);
+ }
+ }
+ }
+
+ //
+ // A class that does an RPC but does not read its response.
+ //
+ static class SlowRPC implements Runnable {
+ private TestProtocol proxy;
+ private volatile boolean done;
+
+ SlowRPC(TestProtocol proxy) {
+ this.proxy = proxy;
+ done = false;
+ }
+
+ boolean isDone() {
+ return done;
+ }
+
+ public void run() {
+ try {
+ proxy.slowPing(true); // this would hang until two fast pings happened
+ done = true;
+ } catch (IOException e) {
+ assertTrue("SlowRPC ping exception " + e, false);
+ }
+ }
+ }
+
+ /** Verifies a slow call does not complete until two fast pings release it,
+ * i.e. that one stuck handler does not block other calls. */
+ public void testSlowRpc() throws Exception {
+ System.out.println("Testing Slow RPC");
+ // create a server with two handlers
+ Server server = RPC.getServer(new TestImpl(), ADDRESS, 0, 2, false, conf);
+ TestProtocol proxy = null;
+
+ try {
+ server.start();
+
+ InetSocketAddress addr = NetUtils.getConnectAddress(server);
+
+ // create a client
+ proxy = (TestProtocol)RPC.getProxy(
+ TestProtocol.class, TestProtocol.versionID, addr, conf);
+
+ SlowRPC slowrpc = new SlowRPC(proxy);
+ Thread thread = new Thread(slowrpc, "SlowRPC");
+ thread.start(); // send a slow RPC, which won't return until two fast pings
+ assertTrue("Slow RPC should not have finished1.", !slowrpc.isDone());
+
+ proxy.slowPing(false); // first fast ping
+
+ // verify that the first RPC is still stuck
+ assertTrue("Slow RPC should not have finished2.", !slowrpc.isDone());
+
+ proxy.slowPing(false); // second fast ping
+
+ // Now the slow ping should be able to be executed
+ while (!slowrpc.isDone()) {
+ System.out.println("Waiting for slow RPC to get done.");
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {}
+ }
+ } finally {
+ server.stop();
+ if (proxy != null) {
+ RPC.stopProxy(proxy);
+ }
+ System.out.println("Down slow rpc testing");
+ }
+ }
+
+
+ /** Exercises every TestProtocol method, multi-threaded large transfers,
+ * and RPC.call multi-address invocation. */
+ public void testCalls() throws Exception {
+ Server server = RPC.getServer(new TestImpl(), ADDRESS, 0, conf);
+ TestProtocol proxy = null;
+ try {
+ server.start();
+
+ InetSocketAddress addr = NetUtils.getConnectAddress(server);
+ proxy = (TestProtocol)RPC.getProxy(
+ TestProtocol.class, TestProtocol.versionID, addr, conf);
+
+ proxy.ping();
+
+ String stringResult = proxy.echo("foo");
+ assertEquals(stringResult, "foo");
+
+ stringResult = proxy.echo((String)null); // null round-trips
+ assertEquals(stringResult, null);
+
+ String[] stringResults = proxy.echo(new String[]{"foo","bar"});
+ assertTrue(Arrays.equals(stringResults, new String[]{"foo","bar"}));
+
+ stringResults = proxy.echo((String[])null);
+ assertTrue(Arrays.equals(stringResults, null));
+
+ UTF8 utf8Result = (UTF8)proxy.echo(new UTF8("hello world"));
+ assertEquals(utf8Result, new UTF8("hello world"));
+
+ utf8Result = (UTF8)proxy.echo((UTF8)null);
+ assertEquals(utf8Result, null);
+
+ int intResult = proxy.add(1, 2);
+ assertEquals(intResult, 3);
+
+ intResult = proxy.add(new int[] {1, 2});
+ assertEquals(intResult, 3);
+
+ boolean caught = false;
+ try {
+ proxy.error(); // server-side IOException must propagate to the client
+ } catch (IOException e) {
+ LOG.debug("Caught " + e);
+ caught = true;
+ }
+ assertTrue(caught);
+
+ proxy.testServerGet();
+
+ // create multiple threads and make them do large data transfers
+ System.out.println("Starting multi-threaded RPC test...");
+ server.setSocketSendBufSize(1024);
+ Thread threadId[] = new Thread[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ Transactions trans = new Transactions(proxy, datasize);
+ threadId[i] = new Thread(trans, "TransactionThread-" + i);
+ threadId[i].start();
+ }
+
+ // wait for all transactions to get over
+ System.out.println("Waiting for all threads to finish RPCs...");
+ for (int i = 0; i < numThreads; i++) {
+ try {
+ threadId[i].join();
+ } catch (InterruptedException e) {
+ i--; // retry
+ }
+ }
+
+ // try some multi-calls
+ Method echo =
+ TestProtocol.class.getMethod("echo", new Class[] { String.class });
+ String[] strings = (String[])RPC.call(echo, new String[][]{{"a"},{"b"}},
+ new InetSocketAddress[] {addr, addr}, conf);
+ assertTrue(Arrays.equals(strings, new String[]{"a","b"}));
+
+ Method ping = TestProtocol.class.getMethod("ping", new Class[] {});
+ Object[] voids = (Object[])RPC.call(ping, new Object[][]{{},{}},
+ new InetSocketAddress[] {addr, addr}, conf);
+ assertEquals(voids, null);
+ } finally {
+ server.stop();
+ if(proxy!=null) RPC.stopProxy(proxy);
+ }
+ }
+
+ /** waitForProxy against an unreachable port must fail with ConnectException. */
+ public void testStandaloneClient() throws IOException {
+ try {
+ RPC.waitForProxy(TestProtocol.class,
+ TestProtocol.versionID, new InetSocketAddress(ADDRESS, 20), conf, 15000L);
+ fail("We should not have reached here");
+ } catch (ConnectException ioe) {
+ //this is what we expected
+ }
+ }
+
+ private static final String ACL_CONFIG = "test.protocol.acl";
+
+ /** Maps the single test ACL key to TestProtocol for authorization tests. */
+ private static class TestPolicyProvider extends PolicyProvider {
+
+ @Override
+ public Service[] getServices() {
+ return new Service[] { new Service(ACL_CONFIG, TestProtocol.class) };
+ }
+
+ }
+
+ /** Pings through a server started with authorization enabled; when
+ * expectFailure, the remote error must unwrap to AuthorizationException. */
+ private void doRPCs(Configuration conf, boolean expectFailure) throws Exception {
+ SecurityUtil.setPolicy(new ConfiguredPolicy(conf, new TestPolicyProvider()));
+
+ Server server = RPC.getServer(new TestImpl(), ADDRESS, 0, 5, true, conf);
+
+ TestProtocol proxy = null;
+
+ server.start();
+
+ InetSocketAddress addr = NetUtils.getConnectAddress(server);
+
+ try {
+ proxy = (TestProtocol)RPC.getProxy(
+ TestProtocol.class, TestProtocol.versionID, addr, conf);
+ proxy.ping();
+
+ if (expectFailure) {
+ fail("Expect RPC.getProxy to fail with AuthorizationException!");
+ }
+ } catch (RemoteException e) {
+ if (expectFailure) {
+ assertTrue(e.unwrapRemoteException() instanceof AuthorizationException);
+ } else {
+ throw e;
+ }
+ } finally {
+ server.stop();
+ if (proxy != null) {
+ RPC.stopProxy(proxy);
+ }
+ }
+ }
+
+ /** ACL "*" must allow the call; a non-matching ACL must deny it. */
+ public void testAuthorization() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setBoolean(
+ ServiceAuthorizationManager.SERVICE_AUTHORIZATION_CONFIG, true);
+
+ // Expect to succeed
+ conf.set(ACL_CONFIG, "*");
+ doRPCs(conf, false);
+
+ // Reset authorization to expect failure
+ conf.set(ACL_CONFIG, "invalid invalid");
+ doRPCs(conf, true);
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ new TestRPC("test").testCalls();
+
+ }
+}
diff --git a/src/test/org/apache/hadoop/log/TestLogLevel.java b/src/test/org/apache/hadoop/log/TestLogLevel.java
new file mode 100644
index 00000000000..f2443c04d90
--- /dev/null
+++ b/src/test/org/apache/hadoop/log/TestLogLevel.java
@@ -0,0 +1,78 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.hadoop.log;
+
+import java.io.*;
+import java.net.*;
+
+import org.apache.hadoop.http.HttpServer;
+
+import junit.framework.TestCase;
+import org.apache.commons.logging.*;
+import org.apache.commons.logging.impl.*;
+import org.apache.log4j.*;
+
+public class TestLogLevel extends TestCase {
+ static final PrintStream out = System.out;
+
+ /** Changes a logger's level twice — once via the /logLevel servlet and
+ * once via the LogLevel command line — verifying the effective level
+ * after each change. Only runs when commons-logging binds to log4j. */
+ public void testDynamicLogLevel() throws Exception {
+ String logName = TestLogLevel.class.getName();
+ Log testlog = LogFactory.getLog(logName);
+
+ //only test Log4JLogger
+ if (testlog instanceof Log4JLogger) {
+ Logger log = ((Log4JLogger)testlog).getLogger();
+ log.debug("log.debug1");
+ log.info("log.info1");
+ log.error("log.error1");
+ // precondition: level is not yet ERROR
+ assertTrue(!Level.ERROR.equals(log.getEffectiveLevel()));
+
+ // NOTE(review): fixed port 22222 is only the requested port; the
+ // actual bound port is read back via getPort() below
+ HttpServer server = new HttpServer("..", "localhost", 22222, true);
+ server.start();
+ int port = server.getPort();
+
+ //servlet
+ URL url = new URL("http://localhost:" + port
+ + "/logLevel?log=" + logName + "&level=" + Level.ERROR);
+ out.println("*** Connecting to " + url);
+ URLConnection connection = url.openConnection();
+ connection.connect();
+
+ // drain the response so the request completes
+ BufferedReader in = new BufferedReader(new InputStreamReader(
+ connection.getInputStream()));
+ for(String line; (line = in.readLine()) != null; out.println(line));
+ in.close();
+
+ log.debug("log.debug2");
+ log.info("log.info2");
+ log.error("log.error2");
+ assertTrue(Level.ERROR.equals(log.getEffectiveLevel()));
+
+ //command line
+ String[] args = {"-setlevel", "localhost:"+port, logName,""+Level.DEBUG};
+ LogLevel.main(args);
+ log.debug("log.debug3");
+ log.info("log.info3");
+ log.error("log.error3");
+ assertTrue(Level.DEBUG.equals(log.getEffectiveLevel()));
+ }
+ else {
+ out.println(testlog.getClass() + " not tested.");
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/metrics/TestMetricsServlet.java b/src/test/org/apache/hadoop/metrics/TestMetricsServlet.java
new file mode 100644
index 00000000000..8d5cfc9a553
--- /dev/null
+++ b/src/test/org/apache/hadoop/metrics/TestMetricsServlet.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.metrics;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.metrics.MetricsServlet.TagsMetricsPair;
+import org.apache.hadoop.metrics.spi.NoEmitMetricsContext;
+import org.apache.hadoop.metrics.spi.OutputRecord;
+import org.mortbay.util.ajax.JSON;
+
+public class TestMetricsServlet extends TestCase {
+ MetricsContext nc1;
+ MetricsContext nc2;
+ // List containing nc1 and nc2.
+ List<MetricsContext> contexts;
+ OutputRecord outputRecord;
+
+ /**
+ * Initializes, for testing, two NoEmitMetricsContext's, and adds one value
+ * to the first of them.
+ */
+ public void setUp() throws IOException {
+ nc1 = new NoEmitMetricsContext();
+ nc1.init("test1", ContextFactory.getFactory());
+ nc2 = new NoEmitMetricsContext();
+ nc2.init("test2", ContextFactory.getFactory());
+ contexts = new ArrayList<MetricsContext>();
+ contexts.add(nc1);
+ contexts.add(nc2);
+
+ MetricsRecord r = nc1.createRecord("testRecord");
+
+ r.setTag("testTag1", "testTagValue1");
+ r.setTag("testTag2", "testTagValue2");
+ r.setMetric("testMetric1", 1);
+ r.setMetric("testMetric2", 33);
+ r.update();
+
+ // sanity-check the fixture: exactly one record in context test1
+ Map<String, Collection<OutputRecord>> m = nc1.getAllRecords();
+ assertEquals(1, m.size());
+ assertEquals(1, m.values().size());
+ Collection<OutputRecord> outputRecords = m.values().iterator().next();
+ assertEquals(1, outputRecords.size());
+ outputRecord = outputRecords.iterator().next();
+ }
+
+
+
+ /** The JSON rendering of a tags/metrics pair must be stable. */
+ public void testTagsMetricsPair() throws IOException {
+ TagsMetricsPair pair = new TagsMetricsPair(outputRecord.getTagsCopy(),
+ outputRecord.getMetricsCopy());
+ String s = JSON.toString(pair);
+ assertEquals(
+ "[{\"testTag1\":\"testTagValue1\",\"testTag2\":\"testTagValue2\"},"+
+ "{\"testMetric1\":1,\"testMetric2\":33}]", s);
+ }
+
+ /** makeMap must expose both contexts and the one record under test1. */
+ public void testGetMap() throws IOException {
+ MetricsServlet servlet = new MetricsServlet();
+ Map<String, Map<String, List<TagsMetricsPair>>> m = servlet.makeMap(contexts);
+ assertEquals("Map missing contexts", 2, m.size());
+ assertTrue(m.containsKey("test1"));
+
+ Map<String, List<TagsMetricsPair>> m2 = m.get("test1");
+
+ assertEquals("Missing records", 1, m2.size());
+ assertTrue(m2.containsKey("testRecord"));
+ assertEquals("Wrong number of tags-values pairs.", 1, m2.get("testRecord").size());
+ }
+
+ /** The plain-text rendering of the map must match the fixed layout. */
+ public void testPrintMap() throws IOException {
+ StringWriter sw = new StringWriter();
+ PrintWriter out = new PrintWriter(sw);
+ MetricsServlet servlet = new MetricsServlet();
+ servlet.printMap(out, servlet.makeMap(contexts));
+
+ String EXPECTED = "" +
+ "test1\n" +
+ " testRecord\n" +
+ " {testTag1=testTagValue1,testTag2=testTagValue2}:\n" +
+ " testMetric1=1\n" +
+ " testMetric2=33\n" +
+ "test2\n";
+ assertEquals(EXPECTED, sw.toString());
+ }
+}
diff --git a/src/test/org/apache/hadoop/metrics/spi/TestOutputRecord.java b/src/test/org/apache/hadoop/metrics/spi/TestOutputRecord.java
new file mode 100644
index 00000000000..02e94a9f1b0
--- /dev/null
+++ b/src/test/org/apache/hadoop/metrics/spi/TestOutputRecord.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.metrics.spi;
+
+import org.apache.hadoop.metrics.spi.AbstractMetricsContext.MetricMap;
+import org.apache.hadoop.metrics.spi.AbstractMetricsContext.TagMap;
+
+import junit.framework.TestCase;
+
+public class TestOutputRecord extends TestCase {
+ /** Verifies that OutputRecord hands out defensive copies of its maps. */
+ public void testCopy() {
+ TagMap tagMap = new TagMap();
+ MetricMap metricMap = new MetricMap();
+ tagMap.put("tagkey", "tagval");
+ metricMap.put("metrickey", 123.4);
+ OutputRecord record = new OutputRecord(tagMap, metricMap);
+
+ // Copies must match in content yet be distinct objects.
+ assertEquals(tagMap, record.getTagsCopy());
+ assertNotSame(tagMap, record.getTagsCopy());
+ assertEquals(metricMap, record.getMetricsCopy());
+ assertNotSame(metricMap, record.getMetricsCopy());
+ }
+}
diff --git a/src/test/org/apache/hadoop/net/StaticMapping.java b/src/test/org/apache/hadoop/net/StaticMapping.java
new file mode 100644
index 00000000000..c3923ed9510
--- /dev/null
+++ b/src/test/org/apache/hadoop/net/StaticMapping.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.net;
+
+import java.util.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+
+/**
+ * Implements the {@link DNSToSwitchMapping} via static mappings. Used
+ * in testcases that simulate racks.
+ *
+ */
+public class StaticMapping extends Configured implements DNSToSwitchMapping {
+  /* Only one instance per JVM; all access is guarded by synchronizing on
+   * the map itself (the original locked the class monitor in
+   * addNodeToRack but the map monitor in resolve). */
+  private static Map<String, String> nameToRackMap =
+      new HashMap<String, String>();
+
+  /**
+   * Applies "host=rack" pairs from "hadoop.configured.node.mapping".
+   * Overrides {@link Configured#setConf} so the mapping is actually
+   * installed when the configuration is injected via the Configurable
+   * contract — the original lower-case {@code setconf} was never called
+   * by that contract.
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf != null) {  // Configured() may call setConf(null)
+      setconf(conf);
+    }
+  }
+
+  /** Kept for backward compatibility; prefer {@link #setConf}. */
+  public void setconf(Configuration conf) {
+    String[] mappings = conf.getStrings("hadoop.configured.node.mapping");
+    if (mappings != null) {
+      for (String str : mappings) {
+        int eq = str.indexOf('=');
+        addNodeToRack(str.substring(0, eq), str.substring(eq + 1));
+      }
+    }
+  }
+
+  /** Registers a host-to-rack mapping; thread-safe. */
+  public static void addNodeToRack(String name, String rackId) {
+    synchronized (nameToRackMap) {  // same lock as resolve()
+      nameToRackMap.put(name, rackId);
+    }
+  }
+
+  /**
+   * Resolves each host name to its registered rack, falling back to
+   * {@link NetworkTopology#DEFAULT_RACK} for unknown hosts.
+   * (Generics added: the original raw types made the enhanced-for over
+   * String elements and the String assignment from the raw map invalid.)
+   */
+  public List<String> resolve(List<String> names) {
+    List<String> racks = new ArrayList<String>(names.size());
+    synchronized (nameToRackMap) {
+      for (String name : names) {
+        String rackId = nameToRackMap.get(name);
+        racks.add(rackId != null ? rackId : NetworkTopology.DEFAULT_RACK);
+      }
+    }
+    return racks;
+  }
+}
diff --git a/src/test/org/apache/hadoop/net/TestDNS.java b/src/test/org/apache/hadoop/net/TestDNS.java
new file mode 100644
index 00000000000..5825ecf8c63
--- /dev/null
+++ b/src/test/org/apache/hadoop/net/TestDNS.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.hadoop.net;
+
+import junit.framework.TestCase;
+
+import java.net.UnknownHostException;
+import java.net.InetAddress;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import javax.naming.NameNotFoundException;
+
+/**
+ * Tests for the {@link DNS} forward and reverse lookup helpers.
+ */
+public class TestDNS extends TestCase {
+
+  private static final Log LOG = LogFactory.getLog(TestDNS.class);
+  private static final String DEFAULT = "default";
+
+  /**
+   * Constructs a test case with the given name.
+   *
+   * @param name test name
+   */
+  public TestDNS(String name) {
+    super(name);
+  }
+
+  /**
+   * Test that asking for the default hostname works
+   * @throws Exception if hostname lookups fail
+   */
+  public void testGetLocalHost() throws Exception {
+    String hostname = DNS.getDefaultHost(DEFAULT);
+    assertNotNull(hostname);
+  }
+
+  /**
+   * Test that repeated calls to getting the local host are fairly fast, and
+   * hence that caching is being used
+   * @throws Exception if hostname lookups fail
+   */
+  public void testGetLocalHostIsFast() throws Exception {
+    String hostname = DNS.getDefaultHost(DEFAULT);
+    assertNotNull(hostname);
+    // Only the third lookup is timed: the first call may legitimately pay
+    // the cost of an uncached resolution.  (Unused t1 marker removed.)
+    String hostname2 = DNS.getDefaultHost(DEFAULT);
+    long t2 = System.currentTimeMillis();
+    String hostname3 = DNS.getDefaultHost(DEFAULT);
+    long t3 = System.currentTimeMillis();
+    assertEquals(hostname3, hostname2);
+    assertEquals(hostname2, hostname);
+    long interval2 = t3 - t2;
+    assertTrue(
+        "It is taking to long to determine the local host -caching is not working",
+        interval2 < 20000);
+  }
+
+  /**
+   * Test that our local IP address is not null
+   * @throws Exception if something went wrong
+   */
+  public void testLocalHostHasAnAddress() throws Exception {
+    assertNotNull(getLocalIPAddr());
+  }
+
+  /** Looks up the default host and resolves it to an address. */
+  private InetAddress getLocalIPAddr() throws UnknownHostException {
+    String hostname = DNS.getDefaultHost(DEFAULT);
+    InetAddress localhost = InetAddress.getByName(hostname);
+    return localhost;
+  }
+
+  /**
+   * Test that passing a null pointer is as the interface
+   * fails with a NullPointerException
+   * @throws Exception if something went wrong
+   */
+  public void testNullInterface() throws Exception {
+    try {
+      String host = DNS.getDefaultHost(null);
+      fail("Expected a NullPointerException, got " + host);
+    } catch (NullPointerException expected) {
+      //this is expected
+    }
+  }
+
+  /**
+   * Get the IP addresses of an unknown interface, expect to get something
+   * back
+   * @throws Exception if something went wrong
+   */
+  public void testIPsOfUnknownInterface() throws Exception {
+    String[] ips = DNS.getIPs("name-of-an-unknown-interface");
+    assertNotNull(ips);
+    assertTrue(ips.length > 0);
+  }
+
+  /**
+   * TestCase: get our local address and reverse look it up
+   * @throws Exception if that fails
+   */
+  public void testRDNS() throws Exception {
+    InetAddress localhost = getLocalIPAddr();
+    try {
+      String s = DNS.reverseDns(localhost, null);
+      LOG.info("Local revers DNS hostname is " + s);
+    } catch (NameNotFoundException e) {
+      // Fix: log the diagnostics when the address IS link-local or loopback.
+      // The original negated the link-local check, so the explanation fired
+      // for exactly the addresses it does not apply to.
+      if (localhost.isLinkLocalAddress() || localhost.isLoopbackAddress()) {
+        //these addresses probably won't work with rDNS anyway, unless someone
+        //has unusual entries in their DNS server mapping 1.0.0.127 to localhost
+        LOG.info("Reverse DNS failing as due to incomplete networking", e);
+        LOG.info("Address is " + localhost
+            + " Loopback=" + localhost.isLoopbackAddress()
+            + " Linklocal=" + localhost.isLinkLocalAddress());
+      }
+
+    }
+  }
+
+  /**
+   * Test that the name "localhost" resolves to something.
+   *
+   * If this fails, your machine's network is in a mess, go edit /etc/hosts
+   * @throws Exception for any problems
+   */
+  public void testLocalhostResolves() throws Exception {
+    InetAddress localhost = InetAddress.getByName("localhost");
+    assertNotNull("localhost is null", localhost);
+    LOG.info("Localhost IPAddr is " + localhost.toString());
+  }
+}
diff --git a/src/test/org/apache/hadoop/net/TestScriptBasedMapping.java b/src/test/org/apache/hadoop/net/TestScriptBasedMapping.java
new file mode 100644
index 00000000000..144dbaa0e36
--- /dev/null
+++ b/src/test/org/apache/hadoop/net/TestScriptBasedMapping.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.net;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+
+import junit.framework.TestCase;
+
+public class TestScriptBasedMapping extends TestCase {
+
+  /**
+   * When the configured per-invocation argument count is below the allowed
+   * minimum, the script mapper must refuse to resolve and return null.
+   * (Raw List/ArrayList replaced with generics; behavior unchanged.)
+   */
+  public void testNoArgsMeansNoResult() {
+    ScriptBasedMapping mapping = new ScriptBasedMapping();
+
+    Configuration conf = new Configuration();
+    conf.setInt(ScriptBasedMapping.SCRIPT_ARG_COUNT_KEY,
+        ScriptBasedMapping.MIN_ALLOWABLE_ARGS - 1);
+    conf.set(ScriptBasedMapping.SCRIPT_FILENAME_KEY, "any-filename");
+
+    mapping.setConf(conf);
+
+    List<String> names = new ArrayList<String>();
+    names.add("some.machine.name");
+    names.add("other.machine.name");
+
+    List<String> result = mapping.resolve(names);
+    assertNull(result);
+  }
+}
diff --git a/src/test/org/apache/hadoop/net/TestSocketIOWithTimeout.java b/src/test/org/apache/hadoop/net/TestSocketIOWithTimeout.java
new file mode 100644
index 00000000000..53f320917c5
--- /dev/null
+++ b/src/test/org/apache/hadoop/net/TestSocketIOWithTimeout.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.net;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.SocketTimeoutException;
+import java.nio.channels.Pipe;
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import junit.framework.TestCase;
+
+/**
+ * This tests timeouts from SocketInputStream and
+ * SocketOutputStream using pipes.
+ *
+ * Normal read and write using these streams are tested by pretty much
+ * every DFS unit test.
+ */
+public class TestSocketIOWithTimeout extends TestCase {
+
+  static Log LOG = LogFactory.getLog(TestSocketIOWithTimeout.class);
+
+  private static int TIMEOUT = 1*1000;
+  private static String TEST_STRING = "1234567890";
+
+  /**
+   * Reads (when {@code in} is non-null) or writes (when {@code out} is
+   * non-null) until the stream's timeout fires, then asserts the elapsed
+   * time is within 200 millis of TIMEOUT.
+   */
+  private void doIO(InputStream in, OutputStream out) throws IOException {
+    /* Keep on writing or reading until we get SocketTimeoutException.
+     * It expects this exception to occur within 200 millis of TIMEOUT
+     * (matching the assertion below).
+     */
+    // NOTE(review): 4192 looks like a typo for 4096, but any size works here.
+    byte buf[] = new byte[4192];
+
+    while (true) {
+      long start = System.currentTimeMillis();
+      try {
+        if (in != null) {
+          in.read(buf);
+        } else {
+          out.write(buf);
+        }
+      } catch (SocketTimeoutException e) {
+        long diff = System.currentTimeMillis() - start;
+        LOG.info("Got SocketTimeoutException as expected after " +
+            diff + " millis : " + e.getMessage());
+        // Only the last iteration (the one that times out) is checked.
+        assertTrue(Math.abs(TIMEOUT - diff) <= 200);
+        break;
+      }
+    }
+  }
+
+  /**
+   * Just reads one byte from the input stream.
+   */
+  static class ReadRunnable implements Runnable {
+    private InputStream in;
+
+    public ReadRunnable(InputStream in) {
+      this.in = in;
+    }
+    public void run() {
+      try {
+        in.read();
+      } catch (IOException e) {
+        LOG.info("Got expection while reading as expected : " +
+            e.getMessage());
+        return;
+      }
+      // read() returning normally means the interrupt was never delivered.
+      assertTrue(false);
+    }
+  }
+
+  /**
+   * End-to-end exercise of timeout, interrupt, and close behavior of the
+   * socket streams, driven over an in-process NIO pipe.
+   */
+  public void testSocketIOWithTimeout() throws IOException {
+
+    // first open pipe:
+    Pipe pipe = Pipe.open();
+    Pipe.SourceChannel source = pipe.source();
+    Pipe.SinkChannel sink = pipe.sink();
+
+    try {
+      InputStream in = new SocketInputStream(source, TIMEOUT);
+      OutputStream out = new SocketOutputStream(sink, TIMEOUT);
+
+      byte[] writeBytes = TEST_STRING.getBytes();
+      byte[] readBytes = new byte[writeBytes.length];
+
+      // Write until the pipe buffer fills and the write times out.
+      out.write(writeBytes);
+      doIO(null, out);
+
+      // Drain what was written, then read until the read times out.
+      in.read(readBytes);
+      assertTrue(Arrays.equals(writeBytes, readBytes));
+      doIO(in, null);
+
+      /*
+       * Verify that it handles interrupted threads properly.
+       * Use a large timeout and expect the thread to return quickly.
+       */
+      in = new SocketInputStream(source, 0);
+      Thread thread = new Thread(new ReadRunnable(in));
+      thread.start();
+
+      // Give the reader time to block inside read() before interrupting.
+      try {
+        Thread.sleep(1000);
+      } catch (InterruptedException ignored) {}
+
+      thread.interrupt();
+
+      try {
+        thread.join();
+      } catch (InterruptedException e) {
+        throw new IOException("Unexpected InterruptedException : " + e);
+      }
+
+      //make sure the channels are still open
+      assertTrue(source.isOpen());
+      assertTrue(sink.isOpen());
+
+      out.close();
+      assertFalse(sink.isOpen());
+
+      // close sink and expect -1 from source.read()
+      assertEquals(-1, in.read());
+
+      // make sure close() closes the underlying channel.
+      in.close();
+      assertFalse(source.isOpen());
+
+    } finally {
+      // source/sink cannot be null here; the checks are purely defensive.
+      if (source != null) {
+        source.close();
+      }
+      if (sink != null) {
+        sink.close();
+      }
+    }
+  }
+}
diff --git a/src/test/org/apache/hadoop/record/FromCpp.java b/src/test/org/apache/hadoop/record/FromCpp.java
new file mode 100644
index 00000000000..2cd2271f43b
--- /dev/null
+++ b/src/test/org/apache/hadoop/record/FromCpp.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.record;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.TreeMap;
+import junit.framework.*;
+
+/**
+ * Reads records previously serialized by the C++ Record I/O test programs
+ * (whose output is expected under /temp) and compares them against a
+ * reference record, verifying cross-language compatibility of the binary,
+ * CSV and XML formats.
+ */
+public class FromCpp extends TestCase {
+
+  public FromCpp(String testName) {
+    super(testName);
+  }
+
+  protected void setUp() throws Exception {
+  }
+
+  protected void tearDown() throws Exception {
+  }
+
+  /**
+   * Builds the reference record with the field values the C++ writers use.
+   * Extracted to remove the triplicated construction code in the three
+   * test methods below.
+   */
+  private static RecRecord1 makeExpectedRecord() {
+    RecRecord1 r1 = new RecRecord1();
+    r1.setBoolVal(true);
+    r1.setByteVal((byte)0x66);
+    r1.setFloatVal(3.145F);
+    r1.setDoubleVal(1.5234);
+    r1.setIntVal(4567);
+    r1.setLongVal(0x5a5a5a5a5a5aL);
+    r1.setStringVal("random text");
+    r1.setBufferVal(new Buffer());
+    r1.setVectorVal(new ArrayList());
+    r1.setMapVal(new TreeMap());
+    return r1;
+  }
+
+  /** Deserializes the binary fixture and checks it equals the reference. */
+  public void testBinary() {
+    try {
+      File tmpfile = new File("/temp/hadooptmp.dat");
+      RecRecord1 r1 = makeExpectedRecord();
+      FileInputStream istream = new FileInputStream(tmpfile);
+      BinaryRecordInput in = new BinaryRecordInput(istream);
+      RecRecord1 r2 = new RecRecord1();
+      r2.deserialize(in, "");
+      istream.close();
+      assertTrue(r1.equals(r2));
+    } catch (IOException ex) {
+      // NOTE(review): swallowing the exception preserves the original
+      // behavior, but it means this test silently passes when the C++
+      // fixture file is missing or unreadable.
+      ex.printStackTrace();
+    }
+  }
+
+  /** Deserializes the CSV fixture and checks it equals the reference. */
+  public void testCsv() {
+    try {
+      File tmpfile = new File("/temp/hadooptmp.txt");
+      RecRecord1 r1 = makeExpectedRecord();
+      FileInputStream istream = new FileInputStream(tmpfile);
+      CsvRecordInput in = new CsvRecordInput(istream);
+      RecRecord1 r2 = new RecRecord1();
+      r2.deserialize(in, "");
+      istream.close();
+      assertTrue(r1.equals(r2));
+    } catch (IOException ex) {
+      // NOTE(review): see testBinary — failures are swallowed by design.
+      ex.printStackTrace();
+    }
+  }
+
+  /** Deserializes the XML fixture and checks it equals the reference. */
+  public void testXml() {
+    try {
+      File tmpfile = new File("/temp/hadooptmp.xml");
+      RecRecord1 r1 = makeExpectedRecord();
+      FileInputStream istream = new FileInputStream(tmpfile);
+      XmlRecordInput in = new XmlRecordInput(istream);
+      RecRecord1 r2 = new RecRecord1();
+      r2.deserialize(in, "");
+      istream.close();
+      assertTrue(r1.equals(r2));
+    } catch (IOException ex) {
+      // NOTE(review): see testBinary — failures are swallowed by design.
+      ex.printStackTrace();
+    }
+  }
+
+}
diff --git a/src/test/org/apache/hadoop/record/RecordBench.java b/src/test/org/apache/hadoop/record/RecordBench.java
new file mode 100644
index 00000000000..1cba75ed804
--- /dev/null
+++ b/src/test/org/apache/hadoop/record/RecordBench.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.record;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Random;
+
+/**
+ * Benchmark for various types of serializations
+ *
+ * Usage: RecordBench {buffer|string|int} {binary|csv|xml} numRecords.
+ * The type and format arguments select init*/run* methods by reflection.
+ */
+public class RecordBench {
+
+  // Accumulates per-phase wall-clock totals (nanoseconds) for one run.
+  private static class Times {
+    long init;        // building the record array
+    long serialize;   // Record.serialize() over all records
+    long deserialize; // Record.deserialize() over all records
+    long write;       // Writable-style write() (binary bench only)
+    long readFields;  // Writable-style readFields() (binary bench only)
+  };
+
+  private static final long SEED = 0xDEADBEEFL;
+  private static final Random rand = new Random();
+
+  /** Do not allow to create a new instance of RecordBench */
+  private RecordBench() {}
+
+  /**
+   * Fills the array with RecBuffer records holding random byte buffers of
+   * length 0..31.  Invoked reflectively from makeArray ("init" + Type + "s").
+   */
+  private static void initBuffers(Record[] buffers) {
+    final int BUFLEN = 32;
+    for (int idx = 0; idx < buffers.length; idx++) {
+      buffers[idx] = new RecBuffer();
+      int buflen = rand.nextInt(BUFLEN);
+      byte[] bytes = new byte[buflen];
+      rand.nextBytes(bytes);
+      ((RecBuffer)buffers[idx]).setData(new Buffer(bytes));
+    }
+  }
+
+  /**
+   * Fills the array with RecString records of 0..31 random code points.
+   * Invoked reflectively from makeArray.
+   */
+  private static void initStrings(Record[] strings) {
+    final int STRLEN = 32;
+    for (int idx = 0; idx < strings.length; idx++) {
+      strings[idx] = new RecString();
+      int strlen = rand.nextInt(STRLEN);
+      StringBuilder sb = new StringBuilder(strlen);
+      for (int ich = 0; ich < strlen; ich++) {
+        int cpt = 0;
+        // Rejection-sample until we draw a code point Utils accepts.
+        while (true) {
+          cpt = rand.nextInt(0x10FFFF+1);
+          if (Utils.isValidCodePoint(cpt)) {
+            break;
+          }
+        }
+        sb.appendCodePoint(cpt);
+      }
+      ((RecString)strings[idx]).setData(sb.toString());
+    }
+  }
+
+  /**
+   * Fills the array with RecInt records holding random ints.
+   * Invoked reflectively from makeArray.
+   */
+  private static void initInts(Record[] ints) {
+    for (int idx = 0; idx < ints.length; idx++) {
+      ints[idx] = new RecInt();
+      ((RecInt)ints[idx]).setData(rand.nextInt());
+    }
+  }
+
+  /**
+   * Allocates numRecords records of the given type and initializes them via
+   * the matching init&lt;Type&gt;s method, recording elapsed time in times.init.
+   */
+  private static Record[] makeArray(String type, int numRecords, Times times) {
+    Method init = null;
+    try {
+      init = RecordBench.class.getDeclaredMethod("init"+
+          toCamelCase(type) + "s",
+          new Class[] {Record[].class});
+    } catch (NoSuchMethodException ex) {
+      throw new RuntimeException(ex);
+    }
+
+    Record[] records = new Record[numRecords];
+    times.init = System.nanoTime();
+    try {
+      init.invoke(null, new Object[]{records});
+    } catch (Exception ex) {
+      throw new RuntimeException(ex);
+    }
+    times.init = System.nanoTime() - times.init;
+    return records;
+  }
+
+  /**
+   * Benchmarks the binary format: serialize/deserialize via
+   * BinaryRecordOutput/Input plus the Writable write/readFields path.
+   * Invoked reflectively from main ("run" + Format + "Bench").
+   */
+  private static void runBinaryBench(String type, int numRecords, Times times)
+    throws IOException {
+    Record[] records = makeArray(type, numRecords, times);
+    ByteArrayOutputStream bout = new ByteArrayOutputStream();
+    BinaryRecordOutput rout = new BinaryRecordOutput(bout);
+    DataOutputStream dout = new DataOutputStream(bout);
+
+    // Untimed warm-up pass; its output is discarded by the reset() below.
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].serialize(rout);
+    }
+    bout.reset();
+
+    times.serialize = System.nanoTime();
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].serialize(rout);
+    }
+    times.serialize = System.nanoTime() - times.serialize;
+
+    byte[] serialized = bout.toByteArray();
+    ByteArrayInputStream bin = new ByteArrayInputStream(serialized);
+    BinaryRecordInput rin = new BinaryRecordInput(bin);
+
+    times.deserialize = System.nanoTime();
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].deserialize(rin);
+    }
+    times.deserialize = System.nanoTime() - times.deserialize;
+
+    bout.reset();
+
+    times.write = System.nanoTime();
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].write(dout);
+    }
+    times.write = System.nanoTime() - times.write;
+
+    // readFields re-reads the serialize output (bin), rewound to the start.
+    bin.reset();
+    DataInputStream din = new DataInputStream(bin);
+
+    times.readFields = System.nanoTime();
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].readFields(din);
+    }
+    times.readFields = System.nanoTime() - times.readFields;
+  }
+
+  /**
+   * Benchmarks the CSV format; only serialize and deserialize are timed
+   * for this format.  Invoked reflectively from main.
+   */
+  private static void runCsvBench(String type, int numRecords, Times times)
+    throws IOException {
+    Record[] records = makeArray(type, numRecords, times);
+    ByteArrayOutputStream bout = new ByteArrayOutputStream();
+    CsvRecordOutput rout = new CsvRecordOutput(bout);
+
+    // Untimed warm-up pass; output discarded by reset().
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].serialize(rout);
+    }
+    bout.reset();
+
+    times.serialize = System.nanoTime();
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].serialize(rout);
+    }
+    times.serialize = System.nanoTime() - times.serialize;
+
+    byte[] serialized = bout.toByteArray();
+    ByteArrayInputStream bin = new ByteArrayInputStream(serialized);
+    CsvRecordInput rin = new CsvRecordInput(bin);
+
+    times.deserialize = System.nanoTime();
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].deserialize(rin);
+    }
+    times.deserialize = System.nanoTime() - times.deserialize;
+  }
+
+  /**
+   * Benchmarks the XML format; only serialize and deserialize are timed
+   * for this format.  Invoked reflectively from main.
+   */
+  private static void runXmlBench(String type, int numRecords, Times times)
+    throws IOException {
+    Record[] records = makeArray(type, numRecords, times);
+    ByteArrayOutputStream bout = new ByteArrayOutputStream();
+    XmlRecordOutput rout = new XmlRecordOutput(bout);
+
+    // Untimed warm-up pass; output discarded by reset().
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].serialize(rout);
+    }
+    bout.reset();
+
+    // Newlines bracket the record stream; presumably required by the XML
+    // reader — TODO confirm against XmlRecordInput.
+    bout.write("\n".getBytes());
+
+    times.serialize = System.nanoTime();
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].serialize(rout);
+    }
+    times.serialize = System.nanoTime() - times.serialize;
+
+    bout.write("\n".getBytes());
+
+    byte[] serialized = bout.toByteArray();
+    ByteArrayInputStream bin = new ByteArrayInputStream(serialized);
+
+    times.deserialize = System.nanoTime();
+    XmlRecordInput rin = new XmlRecordInput(bin);
+    for(int idx = 0; idx < numRecords; idx++) {
+      records[idx].deserialize(rin);
+    }
+    times.deserialize = System.nanoTime() - times.deserialize;
+  }
+
+  /** Prints the per-record average of every phase that was measured. */
+  private static void printTimes(String type,
+                                 String format,
+                                 int numRecords,
+                                 Times times) {
+    System.out.println("Type: " + type + " Format: " + format +
+        " #Records: "+numRecords);
+    if (times.init != 0) {
+      System.out.println("Initialization Time (Per record) : "+
+          times.init/numRecords + " Nanoseconds");
+    }
+
+    if (times.serialize != 0) {
+      System.out.println("Serialization Time (Per Record) : "+
+          times.serialize/numRecords + " Nanoseconds");
+    }
+
+    if (times.deserialize != 0) {
+      System.out.println("Deserialization Time (Per Record) : "+
+          times.deserialize/numRecords + " Nanoseconds");
+    }
+
+    if (times.write != 0) {
+      System.out.println("Write Time (Per Record) : "+
+          times.write/numRecords + " Nanoseconds");
+    }
+
+    if (times.readFields != 0) {
+      System.out.println("ReadFields Time (Per Record) : "+
+          times.readFields/numRecords + " Nanoseconds");
+    }
+
+    System.out.println();
+  }
+
+  /** Upper-cases the first character so "buffer" selects "initBuffers" etc. */
+  private static String toCamelCase(String inp) {
+    char firstChar = inp.charAt(0);
+    if (Character.isLowerCase(firstChar)) {
+      return ""+Character.toUpperCase(firstChar) + inp.substring(1);
+    }
+    return inp;
+  }
+
+  /** Prints usage and terminates the JVM with a non-zero status. */
+  private static void exitOnError() {
+    String usage = "RecordBench {buffer|string|int}"+
+      " {binary|csv|xml} ";
+    System.out.println(usage);
+    System.exit(1);
+  }
+
+  /**
+   * Entry point: runs the selected benchmark once untimed (a dry run,
+   * presumably to warm up the JIT and caches) and once for real, then
+   * prints the results of the second run.
+   *
+   * @param args the command line arguments: type, format, record count
+   */
+  public static void main(String[] args) throws IOException {
+    String version = "RecordBench v0.1";
+    System.out.println(version+"\n");
+
+    if (args.length != 3) {
+      exitOnError();
+    }
+
+    String typeName = args[0];
+    String format = args[1];
+    int numRecords = Integer.decode(args[2]).intValue();
+
+    Method bench = null;
+    try {
+      bench = RecordBench.class.getDeclaredMethod("run"+
+          toCamelCase(format) + "Bench",
+          new Class[] {String.class, Integer.TYPE, Times.class});
+    } catch (NoSuchMethodException ex) {
+      ex.printStackTrace();
+      exitOnError();
+    }
+
+    if (numRecords < 0) {
+      exitOnError();
+    }
+
+    // dry run
+    rand.setSeed(SEED);
+    Times times = new Times();
+    try {
+      bench.invoke(null, new Object[] {typeName, numRecords, times});
+    } catch (Exception ex) {
+      ex.printStackTrace();
+      System.exit(1);
+    }
+
+    // timed run; reseeding makes the data identical to the dry run, and
+    // the same Times object is simply overwritten.
+    rand.setSeed(SEED);
+    try {
+      bench.invoke(null, new Object[] {typeName, numRecords, times});
+    } catch (Exception ex) {
+      ex.printStackTrace();
+      System.exit(1);
+    }
+    printTimes(typeName, format, numRecords, times);
+  }
+}
diff --git a/src/test/org/apache/hadoop/record/TestBuffer.java b/src/test/org/apache/hadoop/record/TestBuffer.java
new file mode 100644
index 00000000000..3012fa6ff46
--- /dev/null
+++ b/src/test/org/apache/hadoop/record/TestBuffer.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.record;
+
+import junit.framework.*;
+
+/**
+ * A Unit test for Record I/O Buffer class
+ */
+public class TestBuffer extends TestCase {
+
+  public TestBuffer(String testName) {
+    super(testName);
+  }
+
+  /**
+   * Test of set method, of class org.apache.hadoop.record.Buffer.
+   */
+  public void testSet() {
+    final byte[] data = new byte[10];
+    final Buffer buffer = new Buffer();
+
+    buffer.set(data);
+
+    // set() stores the reference, so get() must hand back the same array.
+    assertEquals("set failed", data, buffer.get());
+  }
+
+  /**
+   * Test of copy method, of class org.apache.hadoop.record.Buffer.
+   */
+  public void testCopy() {
+    final byte[] data = new byte[10];
+    for (int i = 0; i < data.length; i++) {
+      data[i] = (byte) i;
+    }
+    final Buffer buffer = new Buffer();
+
+    // Copy the three bytes starting at offset 6.
+    buffer.copy(data, 6, 3);
+
+    assertEquals("copy failed", 3, buffer.getCapacity());
+    assertEquals("copy failed", 3, buffer.get().length);
+    for (int i = 0; i < 3; i++) {
+      assertEquals("Buffer content corrupted", i + 6, buffer.get()[i]);
+    }
+  }
+
+  /**
+   * Test of getCount method, of class org.apache.hadoop.record.Buffer.
+   */
+  public void testGetCount() {
+    // A freshly constructed buffer holds no data.
+    final Buffer buffer = new Buffer();
+    assertEquals("getSize failed", 0, buffer.getCount());
+  }
+
+  /**
+   * Test of getCapacity method, of class org.apache.hadoop.record.Buffer.
+   */
+  public void testGetCapacity() {
+    final Buffer buffer = new Buffer();
+
+    // A new buffer starts with no backing storage...
+    assertEquals("getCapacity failed", 0, buffer.getCapacity());
+
+    // ...and setCapacity grows it to exactly the requested size.
+    buffer.setCapacity(100);
+    assertEquals("setCapacity failed", 100, buffer.getCapacity());
+  }
+
+  /**
+   * Test of truncate method, of class org.apache.hadoop.record.Buffer.
+   */
+  public void testTruncate() {
+    final Buffer buffer = new Buffer();
+    buffer.setCapacity(100);
+    assertEquals("setCapacity failed", 100, buffer.getCapacity());
+
+    // With no live data, truncate() drops the capacity back to zero.
+    buffer.truncate();
+    assertEquals("truncate failed", 0, buffer.getCapacity());
+  }
+
+  /**
+   * Test of append method, of class org.apache.hadoop.record.Buffer.
+   */
+  public void testAppend() {
+    final byte[] data = new byte[100];
+    for (int i = 0; i < data.length; i++) {
+      data[i] = (byte) (100 - i);
+    }
+    final Buffer buffer = new Buffer();
+
+    buffer.append(data, 0, data.length);
+
+    assertEquals("Buffer size mismatch", 100, buffer.getCount());
+    for (int i = 0; i < data.length; i++) {
+      assertEquals("Buffer contents corrupted", 100 - i, buffer.get()[i]);
+    }
+  }
+}
diff --git a/src/test/org/apache/hadoop/record/TestRecordIO.java b/src/test/org/apache/hadoop/record/TestRecordIO.java
new file mode 100644
index 00000000000..163ec1b00b2
--- /dev/null
+++ b/src/test/org/apache/hadoop/record/TestRecordIO.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.record;
+
+import java.io.IOException;
+import junit.framework.*;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.util.ArrayList;
+import java.util.TreeMap;
+
+/**
+ */
+public class TestRecordIO extends TestCase {
+
+ public TestRecordIO(String testName) {
+ super(testName);
+ }
+
+ protected void setUp() throws Exception {
+ }
+
+ protected void tearDown() throws Exception {
+ }
+
+ public void testBinary() {
+ File tmpfile;
+ try {
+ tmpfile = File.createTempFile("hadooprec", ".dat");
+ FileOutputStream ostream = new FileOutputStream(tmpfile);
+ BinaryRecordOutput out = new BinaryRecordOutput(ostream);
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(-4567);
+ r1.setLongVal(-2367L);
+ r1.setStringVal("random text");
+ r1.setBufferVal(new Buffer());
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ RecRecord0 r0 = new RecRecord0();
+ r0.setStringVal("other random text");
+ r1.setRecordVal(r0);
+ r1.serialize(out, "");
+ ostream.close();
+ FileInputStream istream = new FileInputStream(tmpfile);
+ BinaryRecordInput in = new BinaryRecordInput(istream);
+ RecRecord1 r2 = new RecRecord1();
+ r2.deserialize(in, "");
+ istream.close();
+ tmpfile.delete();
+ assertTrue("Serialized and deserialized records do not match.", r1.equals(r2));
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ public void testCsv() {
+ File tmpfile;
+ try {
+ tmpfile = File.createTempFile("hadooprec", ".txt");
+ FileOutputStream ostream = new FileOutputStream(tmpfile);
+ CsvRecordOutput out = new CsvRecordOutput(ostream);
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(4567);
+ r1.setLongVal(0x5a5a5a5a5a5aL);
+ r1.setStringVal("random text");
+ r1.setBufferVal(new Buffer());
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ RecRecord0 r0 = new RecRecord0();
+ r0.setStringVal("other random text");
+ r1.setRecordVal(r0);
+ r1.serialize(out, "");
+ ostream.close();
+ FileInputStream istream = new FileInputStream(tmpfile);
+ CsvRecordInput in = new CsvRecordInput(istream);
+ RecRecord1 r2 = new RecRecord1();
+ r2.deserialize(in, "");
+ istream.close();
+ tmpfile.delete();
+ assertTrue("Serialized and deserialized records do not match.", r1.equals(r2));
+
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ public void testToString() {
+ try {
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(4567);
+ r1.setLongVal(0x5a5a5a5a5a5aL);
+ r1.setStringVal("random text");
+ byte[] barr = new byte[256];
+ for (int idx = 0; idx < 256; idx++) {
+ barr[idx] = (byte) idx;
+ }
+ r1.setBufferVal(new Buffer(barr));
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ RecRecord0 r0 = new RecRecord0();
+ r0.setStringVal("other random text");
+ r1.setRecordVal(r0);
+ System.err.println("Illustrating toString bug"+r1.toString());
+ System.err.println("Illustrating toString bug"+r1.toString());
+ } catch (Throwable ex) {
+ assertTrue("Record.toString cannot be invoked twice in succession."+
+ "This bug has been fixed in the latest version.", false);
+ }
+ }
+
+ public void testXml() {
+ File tmpfile;
+ try {
+ tmpfile = File.createTempFile("hadooprec", ".xml");
+ FileOutputStream ostream = new FileOutputStream(tmpfile);
+ XmlRecordOutput out = new XmlRecordOutput(ostream);
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(4567);
+ r1.setLongVal(0x5a5a5a5a5a5aL);
+ r1.setStringVal("ran\002dom < %text<&more\uffff");
+ r1.setBufferVal(new Buffer());
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ RecRecord0 r0 = new RecRecord0();
+ r0.setStringVal("other %rando\007m & >&more text");
+ r1.setRecordVal(r0);
+ r1.serialize(out, "");
+ ostream.close();
+ FileInputStream istream = new FileInputStream(tmpfile);
+ XmlRecordInput in = new XmlRecordInput(istream);
+ RecRecord1 r2 = new RecRecord1();
+ r2.deserialize(in, "");
+ istream.close();
+ tmpfile.delete();
+ assertTrue("Serialized and deserialized records do not match.", r1.equals(r2));
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ public void testCloneable() {
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(-4567);
+ r1.setLongVal(-2367L);
+ r1.setStringVal("random text");
+ r1.setBufferVal(new Buffer());
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ RecRecord0 r0 = new RecRecord0();
+ r0.setStringVal("other random text");
+ r1.setRecordVal(r0);
+ try {
+ RecRecord1 r2 = (RecRecord1) r1.clone();
+ assertTrue("Cloneable semantics violated. r1==r2", r1 != r2);
+ assertTrue("Cloneable semantics violated. r1.getClass() != r2.getClass()",
+ r1.getClass() == r2.getClass());
+ assertTrue("Cloneable semantics violated. !r2.equals(r1)", r2.equals(r1));
+ } catch (final CloneNotSupportedException ex) {
+ ex.printStackTrace();
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/record/TestRecordVersioning.java b/src/test/org/apache/hadoop/record/TestRecordVersioning.java
new file mode 100644
index 00000000000..129ba2ced86
--- /dev/null
+++ b/src/test/org/apache/hadoop/record/TestRecordVersioning.java
@@ -0,0 +1,239 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.record;
+
+import java.io.IOException;
+import junit.framework.*;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.util.ArrayList;
+import java.util.TreeMap;
+import org.apache.hadoop.record.meta.RecordTypeInfo;
+
+/**
+ */
+public class TestRecordVersioning extends TestCase {
+
+ public TestRecordVersioning(String testName) {
+ super(testName);
+ }
+
+ protected void setUp() throws Exception {
+ }
+
+ protected void tearDown() throws Exception {
+ }
+
+ /*
+ * basic versioning
+ * write out a record and its type info, read it back using its typeinfo
+ */
+ public void testBasic() {
+ File tmpfile, tmpRTIfile;
+ try {
+ tmpfile = File.createTempFile("hadooprec", ".dat");
+ tmpRTIfile = File.createTempFile("hadooprti", ".dat");
+ FileOutputStream ostream = new FileOutputStream(tmpfile);
+ BinaryRecordOutput out = new BinaryRecordOutput(ostream);
+ FileOutputStream oRTIstream = new FileOutputStream(tmpRTIfile);
+ BinaryRecordOutput outRTI = new BinaryRecordOutput(oRTIstream);
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(-4567);
+ r1.setLongVal(-2367L);
+ r1.setStringVal("random text");
+ r1.setBufferVal(new Buffer());
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ RecRecord0 r0 = new RecRecord0();
+ r0.setStringVal("other random text");
+ r1.setRecordVal(r0);
+ r1.serialize(out, "");
+ ostream.close();
+ // write out the type info
+ RecRecord1.getTypeInfo().serialize(outRTI);
+ oRTIstream.close();
+
+ // read
+ FileInputStream istream = new FileInputStream(tmpfile);
+ BinaryRecordInput in = new BinaryRecordInput(istream);
+ FileInputStream iRTIstream = new FileInputStream(tmpRTIfile);
+ BinaryRecordInput inRTI = new BinaryRecordInput(iRTIstream);
+ RecordTypeInfo rti = new RecordTypeInfo();
+ rti.deserialize(inRTI);
+ iRTIstream.close();
+ RecRecord1.setTypeFilter(rti);
+ RecRecord1 r2 = new RecRecord1();
+ r2.deserialize(in, "");
+ istream.close();
+ tmpfile.delete();
+ tmpRTIfile.delete();
+ assertTrue("Serialized and deserialized versioned records do not match.", r1.equals(r2));
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ /*
+ * versioning
+ * write out a record and its type info, read back a similar record using the written record's typeinfo
+ */
+ public void testVersioning() {
+ File tmpfile, tmpRTIfile;
+ try {
+ tmpfile = File.createTempFile("hadooprec", ".dat");
+ tmpRTIfile = File.createTempFile("hadooprti", ".dat");
+ FileOutputStream ostream = new FileOutputStream(tmpfile);
+ BinaryRecordOutput out = new BinaryRecordOutput(ostream);
+ FileOutputStream oRTIstream = new FileOutputStream(tmpRTIfile);
+ BinaryRecordOutput outRTI = new BinaryRecordOutput(oRTIstream);
+
+ // we create an array of records to write
+ ArrayList<RecRecordOld> recsWrite = new ArrayList<RecRecordOld>();
+ int i, j, k, l;
+ for (i=0; i<5; i++) {
+ RecRecordOld s1Rec = new RecRecordOld();
+
+ s1Rec.setName("This is record s1: " + i);
+
+ ArrayList iA = new ArrayList();
+ for (j=0; j<3; j++) {
+ iA.add(new Long(i+j));
+ }
+ s1Rec.setIvec(iA);
+
+ ArrayList<ArrayList<RecRecord0>> ssVec = new ArrayList<ArrayList<RecRecord0>>();
+ for (j=0; j<2; j++) {
+ ArrayList sVec = new ArrayList();
+ for (k=0; k<3; k++) {
+ RecRecord0 sRec = new RecRecord0("This is record s: ("+j+": "+k+")");
+ sVec.add(sRec);
+ }
+ ssVec.add(sVec);
+ }
+ s1Rec.setSvec(ssVec);
+
+ s1Rec.setInner(new RecRecord0("This is record s: " + i));
+
+ ArrayList<ArrayList<ArrayList<String>>> aaaVec = new ArrayList<ArrayList<ArrayList<String>>>();
+ for (l=0; l<2; l++) {
+ ArrayList<ArrayList<String>> aaVec = new ArrayList<ArrayList<String>>();
+ for (j=0; j<2; j++) {
+ ArrayList aVec = new ArrayList();
+ for (k=0; k<3; k++) {
+ aVec.add(new String("THis is a nested string: (" + l + ": " + j + ": " + k + ")"));
+ }
+ aaVec.add(aVec);
+ }
+ aaaVec.add(aaVec);
+ }
+ s1Rec.setStrvec(aaaVec);
+
+ s1Rec.setI1(100+i);
+
+ java.util.TreeMap<Byte,String> map1 = new java.util.TreeMap<Byte,String>();
+ map1.put(new Byte("23"), "23");
+ map1.put(new Byte("11"), "11");
+ s1Rec.setMap1(map1);
+
+ java.util.TreeMap<Integer,Long> m1 = new java.util.TreeMap<Integer,Long>();
+ java.util.TreeMap<Integer,Long> m2 = new java.util.TreeMap<Integer,Long>();
+ m1.put(new Integer(5), 5L);
+ m1.put(new Integer(10), 10L);
+ m2.put(new Integer(15), 15L);
+ m2.put(new Integer(20), 20L);
+ java.util.ArrayList<java.util.TreeMap<Integer,Long>> vm1 = new java.util.ArrayList<java.util.TreeMap<Integer,Long>>();
+ vm1.add(m1);
+ vm1.add(m2);
+ s1Rec.setMvec1(vm1);
+ java.util.ArrayList<java.util.TreeMap<Integer,Long>> vm2 = new java.util.ArrayList<java.util.TreeMap<Integer,Long>>();
+ vm2.add(m1);
+ s1Rec.setMvec2(vm2);
+
+ // add to our list
+ recsWrite.add(s1Rec);
+ }
+
+ // write out to file
+ for (RecRecordOld rec: recsWrite) {
+ rec.serialize(out);
+ }
+ ostream.close();
+ // write out the type info
+ RecRecordOld.getTypeInfo().serialize(outRTI);
+ oRTIstream.close();
+
+ // read
+ FileInputStream istream = new FileInputStream(tmpfile);
+ BinaryRecordInput in = new BinaryRecordInput(istream);
+ FileInputStream iRTIstream = new FileInputStream(tmpRTIfile);
+ BinaryRecordInput inRTI = new BinaryRecordInput(iRTIstream);
+ RecordTypeInfo rti = new RecordTypeInfo();
+
+ // read type info
+ rti.deserialize(inRTI);
+ iRTIstream.close();
+ RecRecordNew.setTypeFilter(rti);
+
+ // read records
+ ArrayList<RecRecordNew> recsRead = new ArrayList<RecRecordNew>();
+ for (i=0; i> ss2Vec = s2In.getStrvec().get(j);
+ ArrayList> ss1Vec = s1Out.getStrvec().get(j);
+ for (k=0; k s2Vec = ss2Vec.get(k);
+ ArrayList s1Vec = ss1Vec.get(k);
+ for (l=0; l());
+ r1.setMapVal(new TreeMap());
+ r1.serialize(out, "");
+ ostream.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ public void testCsv() {
+ File tmpfile;
+ try {
+ tmpfile = new File("/tmp/hadooptemp.txt");
+ FileOutputStream ostream = new FileOutputStream(tmpfile);
+ CsvRecordOutput out = new CsvRecordOutput(ostream);
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(4567);
+ r1.setLongVal(0x5a5a5a5a5a5aL);
+ r1.setStringVal("random text");
+ r1.setBufferVal(new Buffer());
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ r1.serialize(out, "");
+ ostream.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ public void testXml() {
+ File tmpfile;
+ try {
+ tmpfile = new File("/tmp/hadooptemp.xml");
+ FileOutputStream ostream = new FileOutputStream(tmpfile);
+ XmlRecordOutput out = new XmlRecordOutput(ostream);
+ RecRecord1 r1 = new RecRecord1();
+ r1.setBoolVal(true);
+ r1.setByteVal((byte)0x66);
+ r1.setFloatVal(3.145F);
+ r1.setDoubleVal(1.5234);
+ r1.setIntVal(4567);
+ r1.setLongVal(0x5a5a5a5a5a5aL);
+ r1.setStringVal("random text");
+ r1.setBufferVal(new Buffer());
+ r1.setVectorVal(new ArrayList());
+ r1.setMapVal(new TreeMap());
+ r1.serialize(out, "");
+ ostream.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/security/TestAccessControlList.java b/src/test/org/apache/hadoop/security/TestAccessControlList.java
new file mode 100644
index 00000000000..57c5abf875a
--- /dev/null
+++ b/src/test/org/apache/hadoop/security/TestAccessControlList.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.security;
+
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.hadoop.security.SecurityUtil.AccessControlList;
+
+import junit.framework.TestCase;
+
+public class TestAccessControlList extends TestCase {
+
+ public void testWildCardAccessControlList() throws Exception {
+ AccessControlList acl;
+
+ acl = new AccessControlList("*");
+ assertTrue(acl.allAllowed());
+
+ acl = new AccessControlList(" * ");
+ assertTrue(acl.allAllowed());
+
+ acl = new AccessControlList(" *");
+ assertTrue(acl.allAllowed());
+
+ acl = new AccessControlList("* ");
+ assertTrue(acl.allAllowed());
+ }
+
+ public void testAccessControlList() throws Exception {
+ AccessControlList acl;
+ Set users;
+ Set groups;
+
+ acl = new AccessControlList("drwho tardis");
+ users = acl.getUsers();
+ assertEquals(users.size(), 1);
+ assertEquals(users.iterator().next(), "drwho");
+ groups = acl.getGroups();
+ assertEquals(groups.size(), 1);
+ assertEquals(groups.iterator().next(), "tardis");
+
+ acl = new AccessControlList("drwho");
+ users = acl.getUsers();
+ assertEquals(users.size(), 1);
+ assertEquals(users.iterator().next(), "drwho");
+ groups = acl.getGroups();
+ assertEquals(groups.size(), 0);
+
+ acl = new AccessControlList("drwho ");
+ users = acl.getUsers();
+ assertEquals(users.size(), 1);
+ assertEquals(users.iterator().next(), "drwho");
+ groups = acl.getGroups();
+ assertEquals(groups.size(), 0);
+
+ acl = new AccessControlList(" tardis");
+ users = acl.getUsers();
+ assertEquals(users.size(), 0);
+ groups = acl.getGroups();
+ assertEquals(groups.size(), 1);
+ assertEquals(groups.iterator().next(), "tardis");
+
+ Iterator iter;
+ acl = new AccessControlList("drwho,joe tardis,users");
+ users = acl.getUsers();
+ assertEquals(users.size(), 2);
+ iter = users.iterator();
+ assertEquals(iter.next(), "drwho");
+ assertEquals(iter.next(), "joe");
+ groups = acl.getGroups();
+ assertEquals(groups.size(), 2);
+ iter = groups.iterator();
+ assertEquals(iter.next(), "tardis");
+ assertEquals(iter.next(), "users");
+
+ acl = new AccessControlList("drwho,joe tardis, users");
+ users = acl.getUsers();
+ assertEquals(users.size(), 2);
+ iter = users.iterator();
+ assertEquals(iter.next(), "drwho");
+ assertEquals(iter.next(), "joe");
+ groups = acl.getGroups();
+ assertEquals(groups.size(), 2);
+ iter = groups.iterator();
+ assertEquals(iter.next(), "tardis");
+ assertEquals(iter.next(), "users");
+ }
+}
diff --git a/src/test/org/apache/hadoop/security/TestAccessToken.java b/src/test/org/apache/hadoop/security/TestAccessToken.java
new file mode 100644
index 00000000000..cd3cc4c482a
--- /dev/null
+++ b/src/test/org/apache/hadoop/security/TestAccessToken.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.security;
+
+import java.util.EnumSet;
+
+import org.apache.hadoop.io.TestWritable;
+
+import junit.framework.TestCase;
+
+/** Unit tests for access tokens */
+public class TestAccessToken extends TestCase {
+ long accessKeyUpdateInterval = 10 * 60 * 1000; // 10 mins
+ long accessTokenLifetime = 2 * 60 * 1000; // 2 mins
+ long blockID1 = 0L;
+ long blockID2 = 10L;
+ long blockID3 = -108L;
+
+ /** test Writable */
+ public void testWritable() throws Exception {
+ TestWritable.testWritable(ExportedAccessKeys.DUMMY_KEYS);
+ AccessTokenHandler handler = new AccessTokenHandler(true,
+ accessKeyUpdateInterval, accessTokenLifetime);
+ ExportedAccessKeys keys = handler.exportKeys();
+ TestWritable.testWritable(keys);
+ TestWritable.testWritable(AccessToken.DUMMY_TOKEN);
+ AccessToken token = handler.generateToken(blockID3, EnumSet
+ .allOf(AccessTokenHandler.AccessMode.class));
+ TestWritable.testWritable(token);
+ }
+
+ private void tokenGenerationAndVerification(AccessTokenHandler master,
+ AccessTokenHandler slave) throws Exception {
+ // single-mode tokens
+ for (AccessTokenHandler.AccessMode mode : AccessTokenHandler.AccessMode
+ .values()) {
+ // generated by master
+ AccessToken token1 = master.generateToken(blockID1, EnumSet.of(mode));
+ assertTrue(master.checkAccess(token1, null, blockID1, mode));
+ assertTrue(slave.checkAccess(token1, null, blockID1, mode));
+ // generated by slave
+ AccessToken token2 = slave.generateToken(blockID2, EnumSet.of(mode));
+ assertTrue(master.checkAccess(token2, null, blockID2, mode));
+ assertTrue(slave.checkAccess(token2, null, blockID2, mode));
+ }
+ // multi-mode tokens
+ AccessToken mtoken = master.generateToken(blockID3, EnumSet
+ .allOf(AccessTokenHandler.AccessMode.class));
+ for (AccessTokenHandler.AccessMode mode : AccessTokenHandler.AccessMode
+ .values()) {
+ assertTrue(master.checkAccess(mtoken, null, blockID3, mode));
+ assertTrue(slave.checkAccess(mtoken, null, blockID3, mode));
+ }
+ }
+
+ /** test access key and token handling */
+ public void testAccessTokenHandler() throws Exception {
+ AccessTokenHandler masterHandler = new AccessTokenHandler(true,
+ accessKeyUpdateInterval, accessTokenLifetime);
+ AccessTokenHandler slaveHandler = new AccessTokenHandler(false,
+ accessKeyUpdateInterval, accessTokenLifetime);
+ ExportedAccessKeys keys = masterHandler.exportKeys();
+ slaveHandler.setKeys(keys);
+ tokenGenerationAndVerification(masterHandler, slaveHandler);
+ // key updating
+ masterHandler.updateKeys();
+ tokenGenerationAndVerification(masterHandler, slaveHandler);
+ keys = masterHandler.exportKeys();
+ slaveHandler.setKeys(keys);
+ tokenGenerationAndVerification(masterHandler, slaveHandler);
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/security/TestUnixUserGroupInformation.java b/src/test/org/apache/hadoop/security/TestUnixUserGroupInformation.java
new file mode 100644
index 00000000000..51880c2d1f6
--- /dev/null
+++ b/src/test/org/apache/hadoop/security/TestUnixUserGroupInformation.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.security;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.TestWritable;
+
+import junit.framework.TestCase;
+
+/** Unit tests for UnixUserGroupInformation */
+public class TestUnixUserGroupInformation extends TestCase {
+ final private static String USER_NAME = "user1";
+ final private static String GROUP1_NAME = "group1";
+ final private static String GROUP2_NAME = "group2";
+ final private static String GROUP3_NAME = "group3";
+ final private static String[] GROUP_NAMES =
+ new String[]{GROUP1_NAME, GROUP2_NAME, GROUP3_NAME};
+
+ /** Test login method */
+ public void testLogin() throws Exception {
+ Configuration conf = new Configuration();
+
+ // login from unix
+ String userName = UnixUserGroupInformation.getUnixUserName();
+ UnixUserGroupInformation curUserGroupInfo =
+ UnixUserGroupInformation.login(conf);
+ assertEquals(curUserGroupInfo.getUserName(), userName);
+ assertTrue(curUserGroupInfo == UnixUserGroupInformation.login(conf));
+
+ // login from the configuration
+ UnixUserGroupInformation userGroupInfo = new UnixUserGroupInformation(
+ USER_NAME, GROUP_NAMES );
+ UnixUserGroupInformation.saveToConf(conf,
+ UnixUserGroupInformation.UGI_PROPERTY_NAME, userGroupInfo);
+ curUserGroupInfo = UnixUserGroupInformation.login(conf);
+ assertEquals(curUserGroupInfo, userGroupInfo);
+ assertTrue(curUserGroupInfo == UnixUserGroupInformation.login(conf));
+ }
+
+ /** test constructor */
+ public void testConstructor() throws Exception {
+ UnixUserGroupInformation uugi =
+ new UnixUserGroupInformation(USER_NAME, GROUP_NAMES);
+ assertEquals(uugi, new UnixUserGroupInformation( new String[]{
+ USER_NAME, GROUP1_NAME, GROUP2_NAME, GROUP3_NAME} ));
+ // failure test
+ testConstructorFailures(null, GROUP_NAMES);
+ testConstructorFailures("", GROUP_NAMES);
+ testConstructorFailures(USER_NAME, null);
+ testConstructorFailures(USER_NAME, new String[0]);
+ testConstructorFailures(USER_NAME, new String[]{null});
+ testConstructorFailures(USER_NAME, new String[]{""});
+ testConstructorFailures(USER_NAME, new String[]{GROUP1_NAME, null});
+ testConstructorFailures(USER_NAME,
+ new String[]{GROUP1_NAME, null, GROUP2_NAME});
+ }
+
+ private void testConstructorFailures(String userName, String[] groupNames) {
+ boolean gotException = false;
+ try {
+ new UnixUserGroupInformation(userName, groupNames);
+ } catch (Exception e) {
+ gotException = true;
+ }
+ assertTrue(gotException);
+ }
+
+ public void testEquals() throws Exception {
+ UnixUserGroupInformation uugi =
+ new UnixUserGroupInformation(USER_NAME, GROUP_NAMES);
+
+ assertEquals(uugi, uugi);
+ assertEquals(uugi, new UnixUserGroupInformation(USER_NAME, GROUP_NAMES));
+ assertEquals(uugi, new UnixUserGroupInformation(USER_NAME,
+ new String[]{GROUP1_NAME, GROUP3_NAME, GROUP2_NAME}));
+ assertFalse(uugi.equals(new UnixUserGroupInformation()));
+ assertFalse(uugi.equals(new UnixUserGroupInformation(USER_NAME,
+ new String[]{GROUP2_NAME, GROUP3_NAME, GROUP1_NAME})));
+ }
+
+ /** test Writable */
+ public void testWritable() throws Exception {
+ UnixUserGroupInformation ugi = new UnixUserGroupInformation(
+ USER_NAME, GROUP_NAMES);
+ TestWritable.testWritable(ugi, new Configuration());
+ }
+}
diff --git a/src/test/org/apache/hadoop/security/authorize/TestConfiguredPolicy.java b/src/test/org/apache/hadoop/security/authorize/TestConfiguredPolicy.java
new file mode 100644
index 00000000000..203946cabd8
--- /dev/null
+++ b/src/test/org/apache/hadoop/security/authorize/TestConfiguredPolicy.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.security.authorize;
+
+import java.security.Permission;
+
+import javax.security.auth.Subject;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.security.UnixUserGroupInformation;
+import org.apache.hadoop.security.SecurityUtil.AccessControlList;
+
+import junit.framework.TestCase;
+
+public class TestConfiguredPolicy extends TestCase {
+ private static final String USER1 = "drwho";
+ private static final String USER2 = "joe";
+ private static final String[] GROUPS1 = new String[]{"tardis"};
+ private static final String[] GROUPS2 = new String[]{"users"};
+
+ private static final String KEY_1 = "test.policy.1";
+ private static final String KEY_2 = "test.policy.2";
+
+ public static class Protocol1 {
+ int i;
+ }
+ public static class Protocol2 {
+ int j;
+ }
+
+ private static class TestPolicyProvider extends PolicyProvider {
+ @Override
+ public Service[] getServices() {
+ return new Service[] {
+ new Service(KEY_1, Protocol1.class),
+ new Service(KEY_2, Protocol2.class),
+ };
+ }
+ }
+
+ public void testConfiguredPolicy() throws Exception {
+ Configuration conf = new Configuration();
+ conf.set(KEY_1, AccessControlList.WILDCARD_ACL_VALUE);
+ conf.set(KEY_2, USER1 + " " + GROUPS1[0]);
+
+ ConfiguredPolicy policy = new ConfiguredPolicy(conf, new TestPolicyProvider());
+ SecurityUtil.setPolicy(policy);
+
+ Subject user1 =
+ SecurityUtil.getSubject(new UnixUserGroupInformation(USER1, GROUPS1));
+
+ // Should succeed
+ ServiceAuthorizationManager.authorize(user1, Protocol1.class);
+
+ // Should fail
+ Subject user2 =
+ SecurityUtil.getSubject(new UnixUserGroupInformation(USER2, GROUPS2));
+ boolean failed = false;
+ try {
+ ServiceAuthorizationManager.authorize(user2, Protocol2.class);
+ } catch (AuthorizationException ae) {
+ failed = true;
+ }
+ assertTrue(failed);
+ }
+}
diff --git a/src/test/org/apache/hadoop/test/CoreTestDriver.java b/src/test/org/apache/hadoop/test/CoreTestDriver.java
new file mode 100644
index 00000000000..06590c9cdf8
--- /dev/null
+++ b/src/test/org/apache/hadoop/test/CoreTestDriver.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.test;
+
+import org.apache.hadoop.io.TestArrayFile;
+import org.apache.hadoop.io.TestSetFile;
+import org.apache.hadoop.ipc.TestIPC;
+import org.apache.hadoop.ipc.TestRPC;
+import org.apache.hadoop.util.ProgramDriver;
+
+/**
+ * Driver for core tests.
+ */
+public class CoreTestDriver {
+
+ private ProgramDriver pgd;
+
+ public CoreTestDriver() {
+ this(new ProgramDriver());
+ }
+
+ public CoreTestDriver(ProgramDriver pgd) {
+ this.pgd = pgd;
+ try {
+ pgd.addClass("testsetfile", TestSetFile.class,
+ "A test for flat files of binary key/value pairs.");
+ pgd.addClass("testarrayfile", TestArrayFile.class,
+ "A test for flat files of binary key/value pairs.");
+ pgd.addClass("testrpc", TestRPC.class, "A test for rpc.");
+ pgd.addClass("testipc", TestIPC.class, "A test for ipc.");
+ } catch(Throwable e) {
+ e.printStackTrace();
+ }
+ }
+
+ public void run(String argv[]) {
+ try {
+ pgd.driver(argv);
+ } catch(Throwable e) {
+ e.printStackTrace();
+ }
+ }
+
+ public static void main(String argv[]){
+ new CoreTestDriver().run(argv);
+ }
+}
diff --git a/src/test/org/apache/hadoop/util/TestCyclicIteration.java b/src/test/org/apache/hadoop/util/TestCyclicIteration.java
new file mode 100644
index 00000000000..7dfa4763e19
--- /dev/null
+++ b/src/test/org/apache/hadoop/util/TestCyclicIteration.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.util;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+
+public class TestCyclicIteration extends junit.framework.TestCase {
+ public void testCyclicIteration() throws Exception {
+ for(int n = 0; n < 5; n++) {
+ checkCyclicIteration(n);
+ }
+ }
+
+ private static void checkCyclicIteration(int numOfElements) {
+ //create a tree map
+ final NavigableMap map = new TreeMap();
+ final Integer[] integers = new Integer[numOfElements];
+ for(int i = 0; i < integers.length; i++) {
+ integers[i] = 2*i;
+ map.put(integers[i], integers[i]);
+ }
+ System.out.println("\n\nintegers=" + Arrays.asList(integers));
+ System.out.println("map=" + map);
+
+ //try starting everywhere
+ for(int start = -1; start <= 2*integers.length - 1; start++) {
+ //get a cyclic iteration
+ final List iteration = new ArrayList();
+ for(Map.Entry e : new CyclicIteration(map, start)) {
+ iteration.add(e.getKey());
+ }
+ System.out.println("start=" + start + ", iteration=" + iteration);
+
+ //verify results
+ for(int i = 0; i < integers.length; i++) {
+ final int j = ((start+2)/2 + i)%integers.length;
+ assertEquals("i=" + i + ", j=" + j, iteration.get(i), integers[j]);
+ }
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/util/TestGenericsUtil.java b/src/test/org/apache/hadoop/util/TestGenericsUtil.java
new file mode 100644
index 00000000000..af494c909d1
--- /dev/null
+++ b/src/test/org/apache/hadoop/util/TestGenericsUtil.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.util;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+
+public class TestGenericsUtil extends TestCase {
+
+ public void testToArray() {
+
+ //test a list of size 10
+ List list = new ArrayList();
+
+ for(int i=0; i<10; i++) {
+ list.add(i);
+ }
+
+ Integer[] arr = GenericsUtil.toArray(list);
+
+ for (int i = 0; i < arr.length; i++) {
+ assertEquals(list.get(i), arr[i]);
+ }
+ }
+
+ public void testWithEmptyList() {
+ try {
+ List list = new ArrayList();
+ String[] arr = GenericsUtil.toArray(list);
+ fail("Empty array should throw exception");
+ System.out.println(arr); //use arr so that compiler will not complain
+
+ }catch (IndexOutOfBoundsException ex) {
+ //test case is successful
+ }
+ }
+
+ public void testWithEmptyList2() {
+ List list = new ArrayList();
+ //this method should not throw IndexOutOfBoundsException
+ String[] arr = GenericsUtil.toArray(String.class, list);
+
+ assertEquals(0, arr.length);
+ }
+
+ /** This class uses generics */
+ private class GenericClass {
+ T dummy;
+ List list = new ArrayList();
+
+ void add(T item) {
+ list.add(item);
+ }
+
+ T[] funcThatUsesToArray() {
+ T[] arr = GenericsUtil.toArray(list);
+ return arr;
+ }
+ }
+
+ public void testWithGenericClass() {
+
+ GenericClass testSubject = new GenericClass();
+
+ testSubject.add("test1");
+ testSubject.add("test2");
+
+ try {
+ //this cast would fail, if we had not used GenericsUtil.toArray, since the
+ //method would return Object[] rather than String[]
+ String[] arr = testSubject.funcThatUsesToArray();
+
+ assertEquals("test1", arr[0]);
+ assertEquals("test2", arr[1]);
+
+ }catch (ClassCastException ex) {
+ fail("GenericsUtil#toArray() is not working for generic classes");
+ }
+
+ }
+
+ public void testGenericOptionsParser() throws Exception {
+ GenericOptionsParser parser = new GenericOptionsParser(
+ new Configuration(), new String[] {"-jt"});
+ assertEquals(parser.getRemainingArgs().length, 0);
+ }
+
+ public void testGetClass() {
+
+ //test with Integer
+ Integer x = new Integer(42);
+ Class c = GenericsUtil.getClass(x);
+ assertEquals(Integer.class, c);
+
+ //test with GenericClass
+ GenericClass testSubject = new GenericClass();
+ Class> c2 = GenericsUtil.getClass(testSubject);
+ assertEquals(GenericClass.class, c2);
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/util/TestIndexedSort.java b/src/test/org/apache/hadoop/util/TestIndexedSort.java
new file mode 100644
index 00000000000..d806a0adce9
--- /dev/null
+++ b/src/test/org/apache/hadoop/util/TestIndexedSort.java
@@ -0,0 +1,361 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.util;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparator;
+
+public class TestIndexedSort extends TestCase {
+
+ public void sortAllEqual(IndexedSorter sorter) throws Exception {
+ final int SAMPLE = 500;
+ int[] values = new int[SAMPLE];
+ Arrays.fill(values, 10);
+ SampleSortable s = new SampleSortable(values);
+ sorter.sort(s, 0, SAMPLE);
+ int[] check = s.getSorted();
+ assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+ Arrays.toString(check), Arrays.equals(values, check));
+ // Set random min/max, re-sort.
+ Random r = new Random();
+ int min = r.nextInt(SAMPLE);
+ int max = (min + 1 + r.nextInt(SAMPLE - 2)) % SAMPLE;
+ values[min] = 9;
+ values[max] = 11;
+ System.out.println("testAllEqual setting min/max at " + min + "/" + max +
+ "(" + sorter.getClass().getName() + ")");
+ s = new SampleSortable(values);
+ sorter.sort(s, 0, SAMPLE);
+ check = s.getSorted();
+ Arrays.sort(values);
+ assertTrue(check[0] == 9);
+ assertTrue(check[SAMPLE - 1] == 11);
+ assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+ Arrays.toString(check), Arrays.equals(values, check));
+ }
+
+ public void sortSorted(IndexedSorter sorter) throws Exception {
+ final int SAMPLE = 500;
+ int[] values = new int[SAMPLE];
+ Random r = new Random();
+ long seed = r.nextLong();
+ r.setSeed(seed);
+ System.out.println("testSorted seed: " + seed +
+ "(" + sorter.getClass().getName() + ")");
+ for (int i = 0; i < SAMPLE; ++i) {
+ values[i] = r.nextInt(100);
+ }
+ Arrays.sort(values);
+ SampleSortable s = new SampleSortable(values);
+ sorter.sort(s, 0, SAMPLE);
+ int[] check = s.getSorted();
+ assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+ Arrays.toString(check), Arrays.equals(values, check));
+ }
+
+ public void sortSequential(IndexedSorter sorter) throws Exception {
+ final int SAMPLE = 500;
+ int[] values = new int[SAMPLE];
+ for (int i = 0; i < SAMPLE; ++i) {
+ values[i] = i;
+ }
+ SampleSortable s = new SampleSortable(values);
+ sorter.sort(s, 0, SAMPLE);
+ int[] check = s.getSorted();
+ assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+ Arrays.toString(check), Arrays.equals(values, check));
+ }
+
+ public void sortSingleRecord(IndexedSorter sorter) throws Exception {
+ final int SAMPLE = 1;
+ SampleSortable s = new SampleSortable(SAMPLE);
+ int[] values = s.getValues();
+ sorter.sort(s, 0, SAMPLE);
+ int[] check = s.getSorted();
+ assertTrue(Arrays.toString(values) + "\ndoesn't match\n" +
+ Arrays.toString(check), Arrays.equals(values, check));
+ }
+
+ public void sortRandom(IndexedSorter sorter) throws Exception {
+ final int SAMPLE = 256 * 1024;
+ SampleSortable s = new SampleSortable(SAMPLE);
+ long seed = s.getSeed();
+ System.out.println("sortRandom seed: " + seed +
+ "(" + sorter.getClass().getName() + ")");
+ int[] values = s.getValues();
+ Arrays.sort(values);
+ sorter.sort(s, 0, SAMPLE);
+ int[] check = s.getSorted();
+ assertTrue("seed: " + seed + "\ndoesn't match\n",
+ Arrays.equals(values, check));
+ }
+
+ public void sortWritable(IndexedSorter sorter) throws Exception {
+ final int SAMPLE = 1000;
+ WritableSortable s = new WritableSortable(SAMPLE);
+ long seed = s.getSeed();
+ System.out.println("sortWritable seed: " + seed +
+ "(" + sorter.getClass().getName() + ")");
+ String[] values = s.getValues();
+ Arrays.sort(values);
+ sorter.sort(s, 0, SAMPLE);
+ String[] check = s.getSorted();
+ assertTrue("seed: " + seed + "\ndoesn't match",
+ Arrays.equals(values, check));
+ }
+
+
+ public void testQuickSort() throws Exception {
+ QuickSort sorter = new QuickSort();
+ sortRandom(sorter);
+ sortSingleRecord(sorter);
+ sortSequential(sorter);
+ sortSorted(sorter);
+ sortAllEqual(sorter);
+ sortWritable(sorter);
+
+ // test degenerate case for median-of-three partitioning
+ // a_n, a_1, a_2, ..., a_{n-1}
+ final int DSAMPLE = 500;
+ int[] values = new int[DSAMPLE];
+ for (int i = 0; i < DSAMPLE; ++i) { values[i] = i; }
+ values[0] = values[DSAMPLE - 1] + 1;
+ SampleSortable s = new SampleSortable(values);
+ values = s.getValues();
+ final int DSS = (DSAMPLE / 2) * (DSAMPLE / 2);
+ // Worst case is (N/2)^2 comparisons, not including those affecting
+ // the median-of-three partitioning; impl should handle this case
+ MeasuredSortable m = new MeasuredSortable(s, DSS);
+ sorter.sort(m, 0, DSAMPLE);
+ System.out.println("QuickSort degen cmp/swp: " +
+ m.getCmp() + "/" + m.getSwp() +
+ "(" + sorter.getClass().getName() + ")");
+ Arrays.sort(values);
+ int[] check = s.getSorted();
+ assertTrue(Arrays.equals(values, check));
+ }
+
+ public void testHeapSort() throws Exception {
+ HeapSort sorter = new HeapSort();
+ sortRandom(sorter);
+ sortSingleRecord(sorter);
+ sortSequential(sorter);
+ sortSorted(sorter);
+ sortAllEqual(sorter);
+ sortWritable(sorter);
+ }
+
+ // Sortables //
+
+ private static class SampleSortable implements IndexedSortable {
+ private int[] valindex;
+ private int[] valindirect;
+ private int[] values;
+ private final long seed;
+
+ public SampleSortable() {
+ this(50);
+ }
+
+ public SampleSortable(int j) {
+ Random r = new Random();
+ seed = r.nextLong();
+ r.setSeed(seed);
+ values = new int[j];
+ valindex = new int[j];
+ valindirect = new int[j];
+ for (int i = 0; i < j; ++i) {
+ valindex[i] = valindirect[i] = i;
+ values[i] = r.nextInt(1000);
+ }
+ }
+
+ public SampleSortable(int[] values) {
+ this.values = values;
+ valindex = new int[values.length];
+ valindirect = new int[values.length];
+ for (int i = 0; i < values.length; ++i) {
+ valindex[i] = valindirect[i] = i;
+ }
+ seed = 0;
+ }
+
+ public long getSeed() {
+ return seed;
+ }
+
+ public int compare(int i, int j) {
+ // assume positive
+ return
+ values[valindirect[valindex[i]]] - values[valindirect[valindex[j]]];
+ }
+
+ public void swap(int i, int j) {
+ int tmp = valindex[i];
+ valindex[i] = valindex[j];
+ valindex[j] = tmp;
+ }
+
+ public int[] getSorted() {
+ int[] ret = new int[values.length];
+ for (int i = 0; i < ret.length; ++i) {
+ ret[i] = values[valindirect[valindex[i]]];
+ }
+ return ret;
+ }
+
+ public int[] getValues() {
+ int[] ret = new int[values.length];
+ System.arraycopy(values, 0, ret, 0, values.length);
+ return ret;
+ }
+
+ }
+
+ public static class MeasuredSortable implements IndexedSortable {
+
+ private int comparisions;
+ private int swaps;
+ private final int maxcmp;
+ private final int maxswp;
+ private IndexedSortable s;
+
+ public MeasuredSortable(IndexedSortable s) {
+ this(s, Integer.MAX_VALUE);
+ }
+
+ public MeasuredSortable(IndexedSortable s, int maxcmp) {
+ this(s, maxcmp, Integer.MAX_VALUE);
+ }
+
+ public MeasuredSortable(IndexedSortable s, int maxcmp, int maxswp) {
+ this.s = s;
+ this.maxcmp = maxcmp;
+ this.maxswp = maxswp;
+ }
+
+ public int getCmp() { return comparisions; }
+ public int getSwp() { return swaps; }
+
+ public int compare(int i, int j) {
+ assertTrue("Expected fewer than " + maxcmp + " comparisons",
+ ++comparisions < maxcmp);
+ return s.compare(i, j);
+ }
+
+ public void swap(int i, int j) {
+ assertTrue("Expected fewer than " + maxswp + " swaps",
+ ++swaps < maxswp);
+ s.swap(i, j);
+ }
+
+ }
+
+ private static class WritableSortable implements IndexedSortable {
+
+ private static Random r = new Random();
+ private final int eob;
+ private final int[] indices;
+ private final int[] offsets;
+ private final byte[] bytes;
+ private final WritableComparator comparator;
+ private final String[] check;
+ private final long seed;
+
+ public WritableSortable() throws IOException {
+ this(100);
+ }
+
+ public WritableSortable(int j) throws IOException {
+ seed = r.nextLong();
+ r.setSeed(seed);
+ Text t = new Text();
+ StringBuffer sb = new StringBuffer();
+ indices = new int[j];
+ offsets = new int[j];
+ check = new String[j];
+ DataOutputBuffer dob = new DataOutputBuffer();
+ for (int i = 0; i < j; ++i) {
+ indices[i] = i;
+ offsets[i] = dob.getLength();
+ genRandom(t, r.nextInt(15) + 1, sb);
+ t.write(dob);
+ check[i] = t.toString();
+ }
+ eob = dob.getLength();
+ bytes = dob.getData();
+ comparator = WritableComparator.get(Text.class);
+ }
+
+ public long getSeed() {
+ return seed;
+ }
+
+ private static void genRandom(Text t, int len, StringBuffer sb) {
+ sb.setLength(0);
+ for (int i = 0; i < len; ++i) {
+ sb.append(Integer.toString(r.nextInt(26) + 10, 36));
+ }
+ t.set(sb.toString());
+ }
+
+ public int compare(int i, int j) {
+ final int ii = indices[i];
+ final int ij = indices[j];
+ return comparator.compare(bytes, offsets[ii],
+ ((ii + 1 == indices.length) ? eob : offsets[ii + 1]) - offsets[ii],
+ bytes, offsets[ij],
+ ((ij + 1 == indices.length) ? eob : offsets[ij + 1]) - offsets[ij]);
+ }
+
+ public void swap(int i, int j) {
+ int tmp = indices[i];
+ indices[i] = indices[j];
+ indices[j] = tmp;
+ }
+
+ public String[] getValues() {
+ return check;
+ }
+
+ public String[] getSorted() throws IOException {
+ String[] ret = new String[indices.length];
+ Text t = new Text();
+ DataInputBuffer dib = new DataInputBuffer();
+ for (int i = 0; i < ret.length; ++i) {
+ int ii = indices[i];
+ dib.reset(bytes, offsets[ii],
+ ((ii + 1 == indices.length) ? eob : offsets[ii + 1]) - offsets[ii]);
+ t.readFields(dib);
+ ret[i] = t.toString();
+ }
+ return ret;
+ }
+
+ }
+
+}
diff --git a/src/test/org/apache/hadoop/util/TestProcfsBasedProcessTree.java b/src/test/org/apache/hadoop/util/TestProcfsBasedProcessTree.java
new file mode 100644
index 00000000000..0b975074026
--- /dev/null
+++ b/src/test/org/apache/hadoop/util/TestProcfsBasedProcessTree.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.util;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Random;
+import java.util.Vector;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Shell.ExitCodeException;
+import org.apache.hadoop.util.Shell.ShellCommandExecutor;
+
+import junit.framework.TestCase;
+
+/**
+ * A JUnit test to test ProcfsBasedProcessTree.
+ */
+public class TestProcfsBasedProcessTree extends TestCase {
+
+ private static final Log LOG = LogFactory
+ .getLog(TestProcfsBasedProcessTree.class);
+ private static String TEST_ROOT_DIR = new Path(System.getProperty(
+ "test.build.data", "/tmp")).toString().replace(' ', '+');
+
+ private ShellCommandExecutor shexec = null;
+ private String pidFile, lowestDescendant;
+ private String shellScript;
+ private static final int N = 6; // Controls the RogueTask
+
+ private class RogueTaskThread extends Thread {
+ public void run() {
+ try {
+ Vector args = new Vector();
+ if(ProcessTree.isSetsidAvailable) {
+ args.add("setsid");
+ }
+ args.add("bash");
+ args.add("-c");
+ args.add(" echo $$ > " + pidFile + "; sh " +
+ shellScript + " " + N + ";") ;
+ shexec = new ShellCommandExecutor(args.toArray(new String[0]));
+ shexec.execute();
+ } catch (ExitCodeException ee) {
+ LOG.info("Shell Command exit with a non-zero exit code. This is" +
+ " expected as we are killing the subprocesses of the" +
+ " task intentionally. " + ee);
+ } catch (IOException ioe) {
+ LOG.info("Error executing shell command " + ioe);
+ } finally {
+ LOG.info("Exit code: " + shexec.getExitCode());
+ }
+ }
+ }
+
+ private String getRogueTaskPID() {
+ File f = new File(pidFile);
+ while (!f.exists()) {
+ try {
+ Thread.sleep(500);
+ } catch (InterruptedException ie) {
+ break;
+ }
+ }
+
+ // read from pidFile
+ return getPidFromPidFile(pidFile);
+ }
+
+ public void testProcessTree() {
+
+ try {
+ if (!ProcfsBasedProcessTree.isAvailable()) {
+ System.out
+ .println("ProcfsBasedProcessTree is not available on this system. Not testing");
+ return;
+ }
+ } catch (Exception e) {
+ LOG.info(StringUtils.stringifyException(e));
+ return;
+ }
+ // create shell script
+ Random rm = new Random();
+ File tempFile = new File(TEST_ROOT_DIR, this.getName() + "_shellScript_" +
+ rm.nextInt() + ".sh");
+ tempFile.deleteOnExit();
+ shellScript = TEST_ROOT_DIR + File.separator + tempFile.getName();
+
+ // create pid file
+ tempFile = new File(TEST_ROOT_DIR, this.getName() + "_pidFile_" +
+ rm.nextInt() + ".pid");
+ tempFile.deleteOnExit();
+ pidFile = TEST_ROOT_DIR + File.separator + tempFile.getName();
+
+ lowestDescendant = TEST_ROOT_DIR + File.separator + "lowestDescendantPidFile";
+
+ // write to shell-script
+ try {
+ FileWriter fWriter = new FileWriter(shellScript);
+ fWriter.write(
+ "# rogue task\n" +
+ "sleep 1\n" +
+ "echo hello\n" +
+ "if [ $1 -ne 0 ]\n" +
+ "then\n" +
+ " sh " + shellScript + " $(($1-1))\n" +
+ "else\n" +
+ " echo $$ > " + lowestDescendant + "\n" +
+ " while true\n do\n" +
+ " sleep 5\n" +
+ " done\n" +
+ "fi");
+ fWriter.close();
+ } catch (IOException ioe) {
+ LOG.info("Error: " + ioe);
+ return;
+ }
+
+ Thread t = new RogueTaskThread();
+ t.start();
+ String pid = getRogueTaskPID();
+ LOG.info("Root process pid: " + pid);
+ ProcfsBasedProcessTree p = new ProcfsBasedProcessTree(pid,
+ ProcessTree.isSetsidAvailable,
+ ProcessTree.DEFAULT_SLEEPTIME_BEFORE_SIGKILL);
+ p = p.getProcessTree(); // initialize
+ LOG.info("ProcessTree: " + p.toString());
+
+ File leaf = new File(lowestDescendant);
+ //wait till lowest descendant process of Rogue Task starts execution
+ while (!leaf.exists()) {
+ try {
+ Thread.sleep(500);
+ } catch (InterruptedException ie) {
+ break;
+ }
+ }
+
+ p = p.getProcessTree(); // reconstruct
+ LOG.info("ProcessTree: " + p.toString());
+
+ // destroy the map task and all its subprocesses
+ p.destroy(true/*in the background*/);
+
+ if(ProcessTree.isSetsidAvailable) {// whole processtree should be gone
+ assertEquals(false, p.isAnyProcessInTreeAlive());
+ }
+ else {// process should be gone
+ assertFalse("ProcessTree must have been gone", p.isAlive());
+ }
+ // Not able to join thread sometimes when forking with large N.
+ try {
+ t.join(2000);
+ LOG.info("RogueTaskThread successfully joined.");
+ } catch (InterruptedException ie) {
+ LOG.info("Interrupted while joining RogueTaskThread.");
+ }
+
+ // ProcessTree is gone now. Any further calls should be sane.
+ p = p.getProcessTree();
+ assertFalse("ProcessTree must have been gone", p.isAlive());
+ assertTrue("Cumulative vmem for the gone-process is "
+ + p.getCumulativeVmem() + " . It should be zero.", p
+ .getCumulativeVmem() == 0);
+ assertTrue(p.toString().equals("[ ]"));
+ }
+
+ /**
+ * Get PID from a pid-file.
+ *
+ * @param pidFileName
+ * Name of the pid-file.
+ * @return the PID string read from the pid-file. Returns null if the
+ * pidFileName points to a non-existing file or if read fails from the
+ * file.
+ */
+ public static String getPidFromPidFile(String pidFileName) {
+ BufferedReader pidFile = null;
+ FileReader fReader = null;
+ String pid = null;
+
+ try {
+ fReader = new FileReader(pidFileName);
+ pidFile = new BufferedReader(fReader);
+ } catch (FileNotFoundException f) {
+ LOG.debug("PidFile doesn't exist : " + pidFileName);
+ return pid;
+ }
+
+ try {
+ pid = pidFile.readLine();
+ } catch (IOException i) {
+ LOG.error("Failed to read from " + pidFileName);
+ } finally {
+ try {
+ if (fReader != null) {
+ fReader.close();
+ }
+ try {
+ if (pidFile != null) {
+ pidFile.close();
+ }
+ } catch (IOException i) {
+ LOG.warn("Error closing the stream " + pidFile);
+ }
+ } catch (IOException i) {
+ LOG.warn("Error closing the stream " + fReader);
+ }
+ }
+ return pid;
+ }
+}
diff --git a/src/test/org/apache/hadoop/util/TestShell.java b/src/test/org/apache/hadoop/util/TestShell.java
new file mode 100644
index 00000000000..ca7303187bc
--- /dev/null
+++ b/src/test/org/apache/hadoop/util/TestShell.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.util;
+
+import junit.framework.TestCase;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+
+public class TestShell extends TestCase {
+
+ private static class Command extends Shell {
+ private int runCount = 0;
+
+ private Command(long interval) {
+ super(interval);
+ }
+
+ protected String[] getExecString() {
+ return new String[] {"echo", "hello"};
+ }
+
+ protected void parseExecResult(BufferedReader lines) throws IOException {
+ ++runCount;
+ }
+
+ public int getRunCount() {
+ return runCount;
+ }
+ }
+
+ public void testInterval() throws IOException {
+ testInterval(Long.MIN_VALUE / 60000); // test a negative interval
+ testInterval(0L); // test a zero interval
+ testInterval(10L); // interval equal to 10mins
+ testInterval(System.currentTimeMillis() / 60000 + 60); // test a very big interval
+ }
+
+ /**
+ * Assert that a string has a substring in it
+ * @param string string to search
+ * @param search the substring to search for
+ */
+ private void assertInString(String string, String search) {
+ assertNotNull("Empty String", string);
+ if (!string.contains(search)) {
+ fail("Did not find \"" + search + "\" in " + string);
+ }
+ }
+
+ public void testShellCommandExecutorToString() throws Throwable {
+ Shell.ShellCommandExecutor sce=new Shell.ShellCommandExecutor(
+ new String[] { "ls","..","arg 2"});
+ String command = sce.toString();
+ assertInString(command,"ls");
+ assertInString(command, " .. ");
+ assertInString(command, "\"arg 2\"");
+ }
+
+ private void testInterval(long interval) throws IOException {
+ Command command = new Command(interval);
+
+ command.run();
+ assertEquals(1, command.getRunCount());
+
+ command.run();
+ if (interval > 0) {
+ assertEquals(1, command.getRunCount());
+ } else {
+ assertEquals(2, command.getRunCount());
+ }
+ }
+}
diff --git a/src/test/org/apache/hadoop/util/TestStringUtils.java b/src/test/org/apache/hadoop/util/TestStringUtils.java
new file mode 100644
index 00000000000..e68609ae2ff
--- /dev/null
+++ b/src/test/org/apache/hadoop/util/TestStringUtils.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.util;
+
+import junit.framework.TestCase;
+
+public class TestStringUtils extends TestCase {
+ final private static String NULL_STR = null;
+ final private static String EMPTY_STR = "";
+ final private static String STR_WO_SPECIAL_CHARS = "AB";
+ final private static String STR_WITH_COMMA = "A,B";
+ final private static String ESCAPED_STR_WITH_COMMA = "A\\,B";
+ final private static String STR_WITH_ESCAPE = "AB\\";
+ final private static String ESCAPED_STR_WITH_ESCAPE = "AB\\\\";
+ final private static String STR_WITH_BOTH2 = ",A\\,,B\\\\,";
+ final private static String ESCAPED_STR_WITH_BOTH2 =
+ "\\,A\\\\\\,\\,B\\\\\\\\\\,";
+
+ public void testEscapeString() throws Exception {
+ assertEquals(NULL_STR, StringUtils.escapeString(NULL_STR));
+ assertEquals(EMPTY_STR, StringUtils.escapeString(EMPTY_STR));
+ assertEquals(STR_WO_SPECIAL_CHARS,
+ StringUtils.escapeString(STR_WO_SPECIAL_CHARS));
+ assertEquals(ESCAPED_STR_WITH_COMMA,
+ StringUtils.escapeString(STR_WITH_COMMA));
+ assertEquals(ESCAPED_STR_WITH_ESCAPE,
+ StringUtils.escapeString(STR_WITH_ESCAPE));
+ assertEquals(ESCAPED_STR_WITH_BOTH2,
+ StringUtils.escapeString(STR_WITH_BOTH2));
+ }
+
+ public void testSplit() throws Exception {
+ assertEquals(NULL_STR, StringUtils.split(NULL_STR));
+ String[] splits = StringUtils.split(EMPTY_STR);
+ assertEquals(0, splits.length);
+ splits = StringUtils.split(",,");
+ assertEquals(0, splits.length);
+ splits = StringUtils.split(STR_WO_SPECIAL_CHARS);
+ assertEquals(1, splits.length);
+ assertEquals(STR_WO_SPECIAL_CHARS, splits[0]);
+ splits = StringUtils.split(STR_WITH_COMMA);
+ assertEquals(2, splits.length);
+ assertEquals("A", splits[0]);
+ assertEquals("B", splits[1]);
+ splits = StringUtils.split(ESCAPED_STR_WITH_COMMA);
+ assertEquals(1, splits.length);
+ assertEquals(ESCAPED_STR_WITH_COMMA, splits[0]);
+ splits = StringUtils.split(STR_WITH_ESCAPE);
+ assertEquals(1, splits.length);
+ assertEquals(STR_WITH_ESCAPE, splits[0]);
+ splits = StringUtils.split(STR_WITH_BOTH2);
+ assertEquals(3, splits.length);
+ assertEquals(EMPTY_STR, splits[0]);
+ assertEquals("A\\,", splits[1]);
+ assertEquals("B\\\\", splits[2]);
+ splits = StringUtils.split(ESCAPED_STR_WITH_BOTH2);
+ assertEquals(1, splits.length);
+ assertEquals(ESCAPED_STR_WITH_BOTH2, splits[0]);
+ }
+
+ public void testUnescapeString() throws Exception {
+ assertEquals(NULL_STR, StringUtils.unEscapeString(NULL_STR));
+ assertEquals(EMPTY_STR, StringUtils.unEscapeString(EMPTY_STR));
+ assertEquals(STR_WO_SPECIAL_CHARS,
+ StringUtils.unEscapeString(STR_WO_SPECIAL_CHARS));
+ try {
+ StringUtils.unEscapeString(STR_WITH_COMMA);
+ fail("Should throw IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ assertEquals(STR_WITH_COMMA,
+ StringUtils.unEscapeString(ESCAPED_STR_WITH_COMMA));
+ try {
+ StringUtils.unEscapeString(STR_WITH_ESCAPE);
+ fail("Should throw IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ assertEquals(STR_WITH_ESCAPE,
+ StringUtils.unEscapeString(ESCAPED_STR_WITH_ESCAPE));
+ try {
+ StringUtils.unEscapeString(STR_WITH_BOTH2);
+ fail("Should throw IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ assertEquals(STR_WITH_BOTH2,
+ StringUtils.unEscapeString(ESCAPED_STR_WITH_BOTH2));
+ }
+
+ public void testTraditionalBinaryPrefix() throws Exception {
+ String[] symbol = {"k", "m", "g", "t", "p", "e"};
+ long m = 1024;
+ for(String s : symbol) {
+ assertEquals(0, StringUtils.TraditionalBinaryPrefix.string2long(0 + s));
+ assertEquals(m, StringUtils.TraditionalBinaryPrefix.string2long(1 + s));
+ m *= 1024;
+ }
+
+ assertEquals(0L, StringUtils.TraditionalBinaryPrefix.string2long("0"));
+ assertEquals(-1259520L, StringUtils.TraditionalBinaryPrefix.string2long("-1230k"));
+ assertEquals(956703965184L, StringUtils.TraditionalBinaryPrefix.string2long("891g"));
+ }
+}