MAPREDUCE-5965. Hadoop streaming throws error if list of input files is high. Error is: "error=7, Argument list too long at if number of input file is high" (wilfreds via rkanter)
This commit is contained in:
parent
5766a04428
commit
cc70df98e7
|
@ -443,6 +443,10 @@ Release 2.8.0 - UNRELEASED
|
||||||
MAPREDUCE-6374. Distributed Cache File visibility should check permission
|
MAPREDUCE-6374. Distributed Cache File visibility should check permission
|
||||||
of full path (Chang Li via jlowe)
|
of full path (Chang Li via jlowe)
|
||||||
|
|
||||||
|
MAPREDUCE-5965. Hadoop streaming throws error if list of input files is
|
||||||
|
high. Error is: "error=7, Argument list too long at if number of input
|
||||||
|
file is high" (wilfreds via rkanter)
|
||||||
|
|
||||||
Release 2.7.1 - UNRELEASED
|
Release 2.7.1 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -19,8 +19,7 @@
|
||||||
package org.apache.hadoop.streaming;
|
package org.apache.hadoop.streaming;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.Map;
|
import java.util.Map.Entry;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
|
@ -238,13 +237,17 @@ public abstract class PipeMapRed {
|
||||||
void addJobConfToEnvironment(JobConf jobconf, Properties env) {
|
void addJobConfToEnvironment(JobConf jobconf, Properties env) {
|
||||||
JobConf conf = new JobConf(jobconf);
|
JobConf conf = new JobConf(jobconf);
|
||||||
conf.setDeprecatedProperties();
|
conf.setDeprecatedProperties();
|
||||||
Iterator it = conf.iterator();
|
int lenLimit = conf.getInt("stream.jobconf.truncate.limit", -1);
|
||||||
while (it.hasNext()) {
|
|
||||||
Map.Entry en = (Map.Entry) it.next();
|
for (Entry<String, String> confEntry: conf) {
|
||||||
String name = (String) en.getKey();
|
String name = confEntry.getKey();
|
||||||
//String value = (String)en.getValue(); // does not apply variable expansion
|
String value = conf.get(name); // does variable expansion
|
||||||
String value = conf.get(name); // does variable expansion
|
|
||||||
name = safeEnvVarName(name);
|
name = safeEnvVarName(name);
|
||||||
|
if (lenLimit > -1 && value.length() > lenLimit) {
|
||||||
|
LOG.warn("Environment variable " + name + " truncated to " + lenLimit
|
||||||
|
+ " to fit system limits.");
|
||||||
|
value = value.substring(0, lenLimit);
|
||||||
|
}
|
||||||
envPut(env, name, value);
|
envPut(env, name, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -617,7 +617,10 @@ public class StreamJob implements Tool {
|
||||||
"/path/my-hadoop-streaming.jar");
|
"/path/my-hadoop-streaming.jar");
|
||||||
System.out.println("For more details about jobconf parameters see:");
|
System.out.println("For more details about jobconf parameters see:");
|
||||||
System.out.println(" http://wiki.apache.org/hadoop/JobConfFile");
|
System.out.println(" http://wiki.apache.org/hadoop/JobConfFile");
|
||||||
System.out.println("To set an environement variable in a streaming " +
|
System.out.println("Truncate the values of the job configuration copied " +
|
||||||
|
"to the environment at the given length:");
|
||||||
|
System.out.println(" -D stream.jobconf.truncate.limit=-1");
|
||||||
|
System.out.println("To set an environment variable in a streaming " +
|
||||||
"command:");
|
"command:");
|
||||||
System.out.println(" -cmdenv EXAMPLE_DIR=/home/example/dictionaries/");
|
System.out.println(" -cmdenv EXAMPLE_DIR=/home/example/dictionaries/");
|
||||||
System.out.println();
|
System.out.println();
|
||||||
|
|
|
@ -55,6 +55,7 @@ Hadoop Streaming
|
||||||
* [How do I update counters in streaming applications?](#How_do_I_update_counters_in_streaming_applications)
|
* [How do I update counters in streaming applications?](#How_do_I_update_counters_in_streaming_applications)
|
||||||
* [How do I update status in streaming applications?](#How_do_I_update_status_in_streaming_applications)
|
* [How do I update status in streaming applications?](#How_do_I_update_status_in_streaming_applications)
|
||||||
* [How do I get the Job variables in a streaming job's mapper/reducer?](#How_do_I_get_the_Job_variables_in_a_streaming_jobs_mapperreducer)
|
* [How do I get the Job variables in a streaming job's mapper/reducer?](#How_do_I_get_the_Job_variables_in_a_streaming_jobs_mapperreducer)
|
||||||
|
* [What do I do if I get a "error=7, Argument list too long"](#What_do_I_do_if_I_get_a_error_Argument_list_too_long)
|
||||||
|
|
||||||
Hadoop Streaming
|
Hadoop Streaming
|
||||||
----------------
|
----------------
|
||||||
|
@ -564,3 +565,11 @@ A streaming process can use the stderr to emit status information. To set a stat
|
||||||
$H3 How do I get the Job variables in a streaming job's mapper/reducer?
|
$H3 How do I get the Job variables in a streaming job's mapper/reducer?
|
||||||
|
|
||||||
See [Configured Parameters](../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html#Configured_Parameters). During the execution of a streaming job, the names of the "mapred" parameters are transformed. The dots ( . ) become underscores ( \_ ). For example, mapreduce.job.id becomes mapreduce\_job\_id and mapreduce.job.jar becomes mapreduce\_job\_jar. In your code, use the parameter names with the underscores.
|
See [Configured Parameters](../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html#Configured_Parameters). During the execution of a streaming job, the names of the "mapred" parameters are transformed. The dots ( . ) become underscores ( \_ ). For example, mapreduce.job.id becomes mapreduce\_job\_id and mapreduce.job.jar becomes mapreduce\_job\_jar. In your code, use the parameter names with the underscores.
|
||||||
|
|
||||||
|
$H3 What do I do if I get a "error=7, Argument list too long"
|
||||||
|
|
||||||
|
The job copies the whole configuration to the environment. If the job is processing a large number of input files, adding the job configuration to the environment could cause an overrun of the environment. The job configuration copy in the environment is not essential for running the job and can be truncated by setting:
|
||||||
|
|
||||||
|
-D stream.jobconf.truncate.limit=20000
|
||||||
|
|
||||||
|
By default, the values are not truncated (-1). A value of zero (0) copies only the names and not the values. For almost all cases, 20000 is a safe value that will prevent the overrun of the environment.
|
||||||
|
|
Loading…
Reference in New Issue