MAPREDUCE-5157. Bring back old sampler related code so that we can support binary compatibility with hadoop-1 sorter example. Contributed by Zhijie Shen.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1480474 13f79535-47bb-0310-9956-ffa450edef68
2013-05-08 21:42:47 +00:00 · 2013-05-08 21:42:47 +00:00 · f6167ef6c6
parent 67d7d8f4d3
commit f6167ef6c6
4 changed files with 384 additions and 11 deletions
--- a/hadoop-mapreduce-project/CHANGES.txt
+++ b/hadoop-mapreduce-project/CHANGES.txt
@ -224,6 +224,9 @@ Release 2.0.5-beta - UNRELEASED
    MAPREDUCE-5159. Change ValueAggregatorJob to add APIs which can support
    binary compatibility with hadoop-1 examples. (Zhijie Shen via vinodkv)
    MAPREDUCE-5157. Bring back old sampler related code so that we can support
    binary compatibility with hadoop-1 sorter example. (Zhijie Shen via vinodkv)
  OPTIMIZATIONS
    MAPREDUCE-4974. Optimising the LineRecordReader initialize() method 
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/lib/InputSampler.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/lib/InputSampler.java
@ -19,10 +19,18 @@
 package org.apache.hadoop.mapred.lib;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Random;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.mapred.InputFormat;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapreduce.Job;
@InterfaceAudience.Public
@ -30,6 +38,8 @@ import org.apache.hadoop.mapreduce.Job;
 public class InputSampler<K,V> extends 
  org.apache.hadoop.mapreduce.lib.partition.InputSampler<K, V> {
  private static final Log LOG = LogFactory.getLog(InputSampler.class);
  public InputSampler(JobConf conf) {
    super(conf);
  }
@ -38,4 +48,219 @@ public class InputSampler<K,V> extends
      throws IOException, ClassNotFoundException, InterruptedException {
    writePartitionFile(new Job(job), sampler);
  }
  /**
   * Interface to sample using an {@link org.apache.hadoop.mapred.InputFormat}.
   */
  public interface Sampler<K,V> extends
    org.apache.hadoop.mapreduce.lib.partition.InputSampler.Sampler<K, V> {
    /**
     * For a given job, collect and return a subset of the keys from the
     * input data.
     */
    K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException;
  }
  /**
   * Samples the first n records from s splits.
   * Inexpensive way to sample random data.
   */
  public static class SplitSampler<K,V> extends
      org.apache.hadoop.mapreduce.lib.partition.InputSampler.SplitSampler<K, V>
          implements Sampler<K,V> {
    /**
     * Create a SplitSampler sampling <em>all</em> splits.
     * Takes the first numSamples / numSplits records from each split.
     * @param numSamples Total number of samples to obtain from all selected
     *                   splits.
     */
    public SplitSampler(int numSamples) {
      this(numSamples, Integer.MAX_VALUE);
    }
    /**
     * Create a new SplitSampler.
     * @param numSamples Total number of samples to obtain from all selected
     *                   splits.
     * @param maxSplitsSampled The maximum number of splits to examine.
     */
    public SplitSampler(int numSamples, int maxSplitsSampled) {
      super(numSamples, maxSplitsSampled);
    }
    /**
     * From each split sampled, take the first numSamples / numSplits records.
     */
    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
    public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
      InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
      ArrayList<K> samples = new ArrayList<K>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.length);
      int splitStep = splits.length / splitsToSample;
      int samplesPerSplit = numSamples / splitsToSample;
      long records = 0;
      for (int i = 0; i < splitsToSample; ++i) {
        RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
            job, Reporter.NULL);
        K key = reader.createKey();
        V value = reader.createValue();
        while (reader.next(key, value)) {
          samples.add(key);
          key = reader.createKey();
          ++records;
          if ((i+1) * samplesPerSplit <= records) {
            break;
          }
        }
        reader.close();
      }
      return (K[])samples.toArray();
    }
  }
  /**
   * Sample from random points in the input.
   * General-purpose sampler. Takes numSamples / maxSplitsSampled inputs from
   * each split.
   */
  public static class RandomSampler<K,V> extends
      org.apache.hadoop.mapreduce.lib.partition.InputSampler.RandomSampler<K, V>
          implements Sampler<K,V> {
    /**
     * Create a new RandomSampler sampling <em>all</em> splits.
     * This will read every split at the client, which is very expensive.
     * @param freq Probability with which a key will be chosen.
     * @param numSamples Total number of samples to obtain from all selected
     *                   splits.
     */
    public RandomSampler(double freq, int numSamples) {
      this(freq, numSamples, Integer.MAX_VALUE);
    }
    /**
     * Create a new RandomSampler.
     * @param freq Probability with which a key will be chosen.
     * @param numSamples Total number of samples to obtain from all selected
     *                   splits.
     * @param maxSplitsSampled The maximum number of splits to examine.
     */
    public RandomSampler(double freq, int numSamples, int maxSplitsSampled) {
      super(freq, numSamples, maxSplitsSampled);
    }
    /**
     * Randomize the split order, then take the specified number of keys from
     * each split sampled, where each key is selected with the specified
     * probability and possibly replaced by a subsequently selected key when
     * the quota of keys from that split is satisfied.
     */
    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
    public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
      InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
      ArrayList<K> samples = new ArrayList<K>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.length);
      Random r = new Random();
      long seed = r.nextLong();
      r.setSeed(seed);
      LOG.debug("seed: " + seed);
      // shuffle splits
      for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
      }
      // our target rate is in terms of the maximum number of sample splits,
      // but we accept the possibility of sampling additional splits to hit
      // the target sample keyset
      for (int i = 0; i < splitsToSample ||
                     (i < splits.length && samples.size() < numSamples); ++i) {
        RecordReader<K,V> reader = inf.getRecordReader(splits[i], job,
            Reporter.NULL);
        K key = reader.createKey();
        V value = reader.createValue();
        while (reader.next(key, value)) {
          if (r.nextDouble() <= freq) {
            if (samples.size() < numSamples) {
              samples.add(key);
            } else {
              // When exceeding the maximum number of samples, replace a
              // random element with this one, then adjust the frequency
              // to reflect the possibility of existing elements being
              // pushed out
              int ind = r.nextInt(numSamples);
              if (ind != numSamples) {
                samples.set(ind, key);
              }
              freq *= (numSamples - 1) / (double) numSamples;
            }
            key = reader.createKey();
          }
        }
        reader.close();
      }
      return (K[])samples.toArray();
    }
  }
  /**
   * Sample from s splits at regular intervals.
   * Useful for sorted data.
   */
  public static class IntervalSampler<K,V> extends
      org.apache.hadoop.mapreduce.lib.partition.InputSampler.IntervalSampler<K, V>
          implements Sampler<K,V> {
    /**
     * Create a new IntervalSampler sampling <em>all</em> splits.
     * @param freq The frequency with which records will be emitted.
     */
    public IntervalSampler(double freq) {
      this(freq, Integer.MAX_VALUE);
    }
    /**
     * Create a new IntervalSampler.
     * @param freq The frequency with which records will be emitted.
     * @param maxSplitsSampled The maximum number of splits to examine.
     * @see #getSample
     */
    public IntervalSampler(double freq, int maxSplitsSampled) {
      super(freq, maxSplitsSampled);
    }
    /**
     * For each split sampled, emit when the ratio of the number of records
     * retained to the total record count is less than the specified
     * frequency.
     */
    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
    public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
      InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
      ArrayList<K> samples = new ArrayList<K>();
      int splitsToSample = Math.min(maxSplitsSampled, splits.length);
      int splitStep = splits.length / splitsToSample;
      long records = 0;
      long kept = 0;
      for (int i = 0; i < splitsToSample; ++i) {
        RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
            job, Reporter.NULL);
        K key = reader.createKey();
        V value = reader.createValue();
        while (reader.next(key, value)) {
          ++records;
          if ((double) kept / records < freq) {
            ++kept;
            samples.add(key);
            key = reader.createKey();
          }
        }
        reader.close();
      }
      return (K[])samples.toArray();
    }
  }
 }
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/partition/InputSampler.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/partition/InputSampler.java
@ -96,8 +96,8 @@ public class InputSampler<K,V> extends Configured implements Tool  {
   */
  public static class SplitSampler<K,V> implements Sampler<K,V> {
-    private final int numSamples;
+    protected final int numSamples;
-    private final int maxSplitsSampled;
+    protected final int maxSplitsSampled;
    /**
     * Create a SplitSampler sampling <em>all</em> splits.
@ -157,9 +157,9 @@ public class InputSampler<K,V> extends Configured implements Tool  {
   * each split.
   */
  public static class RandomSampler<K,V> implements Sampler<K,V> {
-    private double freq;
+    protected double freq;
-    private final int numSamples;
+    protected final int numSamples;
-    private final int maxSplitsSampled;
+    protected final int maxSplitsSampled;
    /**
     * Create a new RandomSampler sampling <em>all</em> splits.
@ -249,8 +249,8 @@ public class InputSampler<K,V> extends Configured implements Tool  {
   * Useful for sorted data.
   */
  public static class IntervalSampler<K,V> implements Sampler<K,V> {
-    private final double freq;
+    protected final double freq;
-    private final int maxSplitsSampled;
+    protected final int maxSplitsSampled;
    /**
     * Create a new IntervalSampler sampling <em>all</em> splits.
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/lib/partition/TestInputSampler.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapreduce/lib/partition/TestInputSampler.java
@ -17,23 +17,26 @@
 */
 package org.apache.hadoop.mapreduce.lib.partition;
 import static org.junit.Assert.assertEquals;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import org.junit.Test;
 import static org.junit.Assert.*;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.junit.Test;
 public class TestInputSampler {
@ -47,6 +50,24 @@ public class TestInputSampler {
    public int getInit() { return i; }
  }
  static class MapredSequentialSplit implements org.apache.hadoop.mapred.InputSplit {
    private int i;
    MapredSequentialSplit(int i) {
      this.i = i;
    }
    @Override
    public long getLength() { return 0; }
    @Override
    public String[] getLocations() { return new String[0]; }
    public int getInit() { return i; }
    @Override
    public void write(DataOutput out) throws IOException {
    }
    @Override
    public void readFields(DataInput in) throws IOException {
    }
  }
  static class TestInputSamplerIF
      extends InputFormat<IntWritable,NullWritable> {
@ -90,6 +111,71 @@ public class TestInputSampler {
  }
  static class TestMapredInputSamplerIF extends TestInputSamplerIF implements
      org.apache.hadoop.mapred.InputFormat<IntWritable,NullWritable> {
    TestMapredInputSamplerIF(int maxDepth, int numSplits, int... splitInit) {
      super(maxDepth, numSplits, splitInit);
    }
    @Override
    public org.apache.hadoop.mapred.InputSplit[] getSplits(JobConf job,
        int numSplits) throws IOException {
      List<InputSplit> splits = null;
      try {
        splits = getSplits(Job.getInstance(job));
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
      org.apache.hadoop.mapred.InputSplit[] retVals =
          new org.apache.hadoop.mapred.InputSplit[splits.size()];
      for (int i = 0; i < splits.size(); ++i) {
        MapredSequentialSplit split = new MapredSequentialSplit(
            ((SequentialSplit) splits.get(i)).getInit());
        retVals[i] = split;
      }
      return retVals;
    }
    @Override
    public org.apache.hadoop.mapred.RecordReader<IntWritable, NullWritable>
        getRecordReader(final org.apache.hadoop.mapred.InputSplit split,
            JobConf job, Reporter reporter) throws IOException {
      return new org.apache.hadoop.mapred.RecordReader
          <IntWritable, NullWritable>() {
        private final IntWritable i =
            new IntWritable(((MapredSequentialSplit)split).getInit());
        private int maxVal = i.get() + maxDepth + 1;
        @Override
        public boolean next(IntWritable key, NullWritable value)
            throws IOException {
          i.set(i.get() + 1);
          return i.get() < maxVal;
        }
        @Override
        public IntWritable createKey() {
          return new IntWritable(i.get());
        }
        @Override
        public NullWritable createValue() {
          return NullWritable.get();
        }
        @Override
        public long getPos() throws IOException {
          return 0;
        }
        @Override
        public void close() throws IOException {
        }
        @Override
        public float getProgress() throws IOException {
          return 0;
        }
      };
    }
  }
  /**
   * Verify SplitSampler contract, that an equal number of records are taken
   * from the first splits.
@ -118,6 +204,36 @@ public class TestInputSampler {
    }
  }
  /**
   * Verify SplitSampler contract in mapred.lib.InputSampler, which is added
   * back for binary compatibility of M/R 1.x
   */
  @Test (timeout = 30000)
  @SuppressWarnings("unchecked") // IntWritable comparator not typesafe
  public void testMapredSplitSampler() throws Exception {
    final int TOT_SPLITS = 15;
    final int NUM_SPLITS = 5;
    final int STEP_SAMPLE = 5;
    final int NUM_SAMPLES = NUM_SPLITS * STEP_SAMPLE;
    org.apache.hadoop.mapred.lib.InputSampler.Sampler<IntWritable,NullWritable>
        sampler = new org.apache.hadoop.mapred.lib.InputSampler.SplitSampler
            <IntWritable,NullWritable>(NUM_SAMPLES, NUM_SPLITS);
    int inits[] = new int[TOT_SPLITS];
    for (int i = 0; i < TOT_SPLITS; ++i) {
      inits[i] = i * STEP_SAMPLE;
    }
    Object[] samples = sampler.getSample(
        new TestMapredInputSamplerIF(100000, TOT_SPLITS, inits),
        new JobConf());
    assertEquals(NUM_SAMPLES, samples.length);
    Arrays.sort(samples, new IntWritable.Comparator());
    for (int i = 0; i < NUM_SAMPLES; ++i) {
      // mapred.lib.InputSampler.SplitSampler has a sampling step
      assertEquals(i % STEP_SAMPLE + TOT_SPLITS * (i / STEP_SAMPLE),
          ((IntWritable)samples[i]).get());
    }
  }
  /**
   * Verify IntervalSampler contract, that samples are taken at regular
   * intervals from the given splits.
@ -146,4 +262,33 @@ public class TestInputSampler {
    }
  }
  /**
   * Verify IntervalSampler in mapred.lib.InputSampler, which is added back
   * for binary compatibility of M/R 1.x
   */
  @Test (timeout = 30000)
  @SuppressWarnings("unchecked") // IntWritable comparator not typesafe
  public void testMapredIntervalSampler() throws Exception {
    final int TOT_SPLITS = 16;
    final int PER_SPLIT_SAMPLE = 4;
    final int NUM_SAMPLES = TOT_SPLITS * PER_SPLIT_SAMPLE;
    final double FREQ = 1.0 / TOT_SPLITS;
    org.apache.hadoop.mapred.lib.InputSampler.Sampler<IntWritable,NullWritable>
        sampler = new org.apache.hadoop.mapred.lib.InputSampler.IntervalSampler
            <IntWritable,NullWritable>(FREQ, NUM_SAMPLES);
    int inits[] = new int[TOT_SPLITS];
    for (int i = 0; i < TOT_SPLITS; ++i) {
      inits[i] = i;
    }
    Job ignored = Job.getInstance();
    Object[] samples = sampler.getSample(new TestInputSamplerIF(
          NUM_SAMPLES, TOT_SPLITS, inits), ignored);
    assertEquals(NUM_SAMPLES, samples.length);
    Arrays.sort(samples, new IntWritable.Comparator());
    for (int i = 0; i < NUM_SAMPLES; ++i) {
      assertEquals(i,
          ((IntWritable)samples[i]).get());
    }
  }
 }