From 71ac65859ebf0934f78df1b839168b8be57c6858 Mon Sep 17 00:00:00 2001 From: Alejandro Abdelnur Date: Fri, 20 Jan 2012 18:55:20 +0000 Subject: [PATCH] Merge -r 1215140:1215141 from trunk to branch. FIXES: MAPREDUCE-778 git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23@1234070 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-mapreduce-project/CHANGES.txt | 5 +- hadoop-mapreduce-project/ivy.xml | 7 + .../ivy/libraries.properties | 5 +- .../hadoop/mapred/gridmix/GridmixJob.java | 17 +- .../src/documentation/content/xdocs/rumen.xml | 172 +- .../tools/rumen/TestRumenAnonymization.java | 1940 +++++++++++++++++ .../hadoop/tools/rumen/TestRumenFolder.java | 4 - .../tools/rumen/TestRumenJobTraces.java | 4 +- .../apache/hadoop/tools/rumen/Anonymizer.java | 273 +++ .../org/apache/hadoop/tools/rumen/Folder.java | 79 +- .../tools/rumen/HadoopLogsAnalyzer.java | 45 +- .../apache/hadoop/tools/rumen/JobBuilder.java | 25 +- .../tools/rumen/JsonObjectMapperWriter.java | 19 + .../apache/hadoop/tools/rumen/LoggedJob.java | 78 +- .../hadoop/tools/rumen/LoggedLocation.java | 47 +- .../tools/rumen/LoggedNetworkTopology.java | 19 +- .../apache/hadoop/tools/rumen/LoggedTask.java | 11 +- .../hadoop/tools/rumen/LoggedTaskAttempt.java | 47 +- .../apache/hadoop/tools/rumen/ParsedHost.java | 14 +- .../hadoop/tools/rumen/ZombieCluster.java | 7 +- .../apache/hadoop/tools/rumen/ZombieJob.java | 52 +- .../rumen/anonymization/DataAnonymizer.java | 27 + .../tools/rumen/anonymization/WordList.java | 106 + .../WordListAnonymizerUtility.java | 110 + .../rumen/datatypes/AnonymizableDataType.java | 28 + .../tools/rumen/datatypes/ClassName.java | 57 + .../tools/rumen/datatypes/DataType.java | 25 + .../DefaultAnonymizableDataType.java | 67 + .../rumen/datatypes/DefaultDataType.java | 37 + .../tools/rumen/datatypes/FileName.java | 213 ++ .../hadoop/tools/rumen/datatypes/JobName.java | 41 + .../tools/rumen/datatypes/JobProperties.java | 93 + .../tools/rumen/datatypes/NodeName.java 
| 185 ++ .../tools/rumen/datatypes/QueueName.java | 41 + .../tools/rumen/datatypes/UserName.java | 40 + .../util/DefaultJobPropertiesParser.java | 31 + .../datatypes/util/JobPropertyParser.java | 34 + .../util/MapReduceJobPropertiesParser.java | 227 ++ .../rumen/serializers/BlockingSerializer.java | 36 + .../DefaultAnonymizingRumenSerializer.java | 57 + .../serializers/DefaultRumenSerializer.java | 42 + .../serializers/ObjectStringSerializer.java | 35 + .../hadoop/tools/rumen/state/State.java | 46 + .../tools/rumen/state/StateDeserializer.java | 59 + .../hadoop/tools/rumen/state/StatePool.java | 345 +++ 45 files changed, 4603 insertions(+), 249 deletions(-) create mode 100644 hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/tools/rumen/TestRumenAnonymization.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/Anonymizer.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/anonymization/DataAnonymizer.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/anonymization/WordList.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/anonymization/WordListAnonymizerUtility.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/AnonymizableDataType.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/ClassName.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/DataType.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/DefaultAnonymizableDataType.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/DefaultDataType.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/FileName.java create mode 100644 
hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/JobName.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/JobProperties.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/NodeName.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/QueueName.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/UserName.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/util/DefaultJobPropertiesParser.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/util/JobPropertyParser.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/util/MapReduceJobPropertiesParser.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/BlockingSerializer.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/DefaultAnonymizingRumenSerializer.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/DefaultRumenSerializer.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/ObjectStringSerializer.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/state/State.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/state/StateDeserializer.java create mode 100644 hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/state/StatePool.java diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 764975f0554..b339d542c40 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -4,7 +4,8 @@ Release 0.23.1 - Unreleased 
INCOMPATIBLE CHANGES - NEW FEATURES + NEW FEATURES + MAPREDUCE-778. Rumen Anonymizer. (Amar Kamat and Chris Douglas via amarrk) MAPREDUCE-3121. NodeManager should handle disk-failures (Ravi Gummadi via mahadev) @@ -14,6 +15,8 @@ Release 0.23.1 - Unreleased MAPREDUCE-3251. Network ACLs can prevent some clients to talk to MR ApplicationMaster. (Anupam Seth via mahadev) + MAPREDUCE-778. Rumen Anonymizer. (Amar Kamat and Chris Douglas via amarrk) + IMPROVEMENTS MAPREDUCE-3375. [Gridmix] Memory Emulation system tests. (Vinay Thota via amarrk) diff --git a/hadoop-mapreduce-project/ivy.xml b/hadoop-mapreduce-project/ivy.xml index e9b38d077eb..e04da7019bb 100644 --- a/hadoop-mapreduce-project/ivy.xml +++ b/hadoop-mapreduce-project/ivy.xml @@ -139,6 +139,13 @@ + + + + + diff --git a/hadoop-mapreduce-project/ivy/libraries.properties b/hadoop-mapreduce-project/ivy/libraries.properties index 06ed6d98f65..76d05e295df 100644 --- a/hadoop-mapreduce-project/ivy/libraries.properties +++ b/hadoop-mapreduce-project/ivy/libraries.properties @@ -81,5 +81,6 @@ wagon-http.version=1.0-beta-2 xmlenc.version=0.52 xerces.version=1.4.4 -yarn.version=0.23.1-SNAPSHOT -hadoop-mapreduce.version=0.23.1-SNAPSHOT +jackson.version=1.8.2 +yarn.version=0.24.0-SNAPSHOT +hadoop-mapreduce.version=0.24.0-SNAPSHOT diff --git a/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java b/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java index 9b6ed69f575..77ec697872f 100644 --- a/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java +++ b/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java @@ -26,8 +26,6 @@ import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Delayed; import java.util.concurrent.TimeUnit; -import java.util.regex.Matcher; -import 
java.util.regex.Pattern; import java.security.PrivilegedExceptionAction; import org.apache.hadoop.conf.Configuration; @@ -49,6 +47,7 @@ import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig; import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.tools.rumen.JobStory; +import static org.apache.hadoop.tools.rumen.datatypes.util.MapReduceJobPropertiesParser.extractMaxHeapOpts; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -92,8 +91,6 @@ abstract class GridmixJob implements Callable, Delayed { // configuration key to enable/disable task jvm options static final String GRIDMIX_TASK_JVM_OPTIONS_ENABLE = "gridmix.task.jvm-options.enable"; - private static final Pattern maxHeapPattern = - Pattern.compile("-Xmx[0-9]+[kKmMgGtT]?+"); private static void setJobQueue(Job job, String queue) { if (queue != null) { @@ -225,18 +222,6 @@ abstract class GridmixJob implements Callable, Delayed { } } } - - private static void extractMaxHeapOpts(String javaOptions, - List maxOpts, List others) { - for (String opt : javaOptions.split(" ")) { - Matcher matcher = maxHeapPattern.matcher(opt); - if (matcher.find()) { - maxOpts.add(opt); - } else { - others.add(opt); - } - } - } // Scales the desired job-level configuration parameter. This API makes sure // that the ratio of the job level configuration parameter to the cluster diff --git a/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml b/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml index 75b97ac5e8a..dbe72c56ca7 100644 --- a/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml +++ b/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml @@ -73,6 +73,11 @@ computed for the total number of successful tasks for every attempt. +
  • Anonymized traces enable sharing of production traces of large + scale Hadoop deployments. Sharing of traces will foster + collaboration within the Hadoop community. They can also be used to + supplement interesting research findings. +
  • @@ -102,6 +107,11 @@ Increasing the trace runtime might involve adding some dummy jobs to the resulting trace and scaling up the runtime of individual jobs. +
  • Anonymizer : + A utility to anonymize Hadoop job and cluster topology traces by + masking certain sensitive fields but retaining important workload + characteristics. +
  • @@ -128,10 +138,11 @@ output-duration, concentration etc. -

    Rumen provides 2 basic commands

    +

    Rumen provides 3 basic commands

    • TraceBuilder
    • Folder
    • +
    • Anonymizer

    Firstly, we need to generate the Gold Trace. Hence the first @@ -139,8 +150,9 @@ The output of the TraceBuilder is a job-trace file (and an optional cluster-topology file). In case we want to scale the output, we can use the Folder utility to fold the current trace to the - desired length. The remaining part of this section explains these - utilities in detail. + desired length. For anonymizing the trace, use the + Anonymizer utility. The remaining part of this section + explains these utilities in detail.

    Examples in this section assumes that certain libraries are present @@ -426,8 +438,156 @@

    +

    +

    +

    +

    +

    + + + + +
    + Anonymizer + +

    Command:

    + java org.apache.hadoop.tools.rumen.Anonymizer [options] [-trace <jobtrace-input> <jobtrace-output>] [-topology <topology-input> <topology-output>] + +

    This command invokes the Anonymizer utility of + Rumen. It anonymizes sensitive information from the + <jobtrace-input> file and outputs the anonymized + content into the <jobtrace-output> + file. It also anonymizes the cluster layout (topology) from the + <topology-input> and outputs it in + the <topology-output> file. + <jobtrace-input> represents the job trace file obtained + using TraceBuilder or Folder. + <topology-input> represents the cluster topology + file obtained using TraceBuilder. +

    + +

    Options :

    + + + + + + + + + + + + + + + + +
    ParameterDescriptionNotes
    -traceAnonymizes job traces.Anonymizes sensitive fields like user-name, job-name, queue-name, + host-names, job configuration parameters etc.
    -topologyAnonymizes cluster topologyAnonymizes rack-names and host-names.
    + +
    + <em>Anonymizer</em> Configuration Parameters +

    The Rumen anonymizer can be configured using the following + configuration parameters: +

    + + + + + + + + + + + + + + + + + + + + + + + + + +
    ParameterDescription
    + rumen.data-types.classname.preserve + A comma separated list of prefixes that the Anonymizer + will not anonymize while processing classnames. If + rumen.data-types.classname.preserve is set to + 'org.apache,com.hadoop.' then + classnames starting with 'org.apache' or + 'com.hadoop.' will not be anonymized. +
    + rumen.datatypes.jobproperties.parsers + A comma separated list of job properties parsers. These parsers + decide how the job configuration parameters + (i.e. <key,value> pairs) should be processed. Default is + MapReduceJobPropertiesParser. The default parser will + only parse framework-level MapReduce specific job configuration + properties. Users can add custom parsers by implementing the + JobPropertyParser interface. Rumen also provides an + all-pass (i.e. no filter) parser called + DefaultJobPropertiesParser. +
    + rumen.anonymization.states.dir + Set this to a location (on LocalFileSystem or HDFS) for enabling + state persistence and/or reload. This parameter is not set by + default. Reloading and persistence of states depend on the state + directory. Note that the state directory will contain the latest + as well as previous states. +
    + rumen.anonymization.states.persist + Set this to 'true' to persist the current state. + Default value is 'false'. Note that the states will + be persisted to the state manager's state directory + specified using the rumen.anonymization.states.dir + parameter. +
    + rumen.anonymization.states.reload + Set this to 'true' to enable reuse of previously + persisted state. The default value is 'false'. The + previously persisted state will be reloaded from the state + manager's state directory specified using the + rumen.anonymization.states.dir parameter. Note that + the Anonymizer will bail out if it fails to find any + previously persisted state in the state directory or if the state + directory is not set. If the user wishes to retain/reuse the + states across multiple invocations of the Anonymizer, + then the very first invocation of the Anonymizer should + have rumen.anonymization.states.reload set to + 'false' and + rumen.anonymization.states.persist set to + 'true'. Subsequent invocations of the + Anonymizer can then have + rumen.anonymization.states.reload set to + 'true'. +
    +
    + +
    + Example + java org.apache.hadoop.tools.rumen.Anonymizer -trace file:///home/user/job-trace.json file:///home/user/job-trace-anonymized.json -topology file:///home/user/cluster-topology.json file:///home/user/cluster-topology-anonymized.json +

    +

    This will anonymize the job details from + file:///home/user/job-trace.json and output it to + file:///home/user/job-trace-anonymized.json. + It will also anonymize the cluster topology layout from + file:///home/user/cluster-topology.json and output it to + file:///home/user/cluster-topology-anonymized.json. + Note that the Anonymizer also supports input and output + files on HDFS. +

    +
    -