MAPREDUCE-5157. Bring back old sampler related code so that we can support binary compatibility with hadoop-1 sorter example. Contributed by Zhijie Shen.
svn merge --ignore-ancestry -c 1480474 ../../trunk/

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1480476 13f79535-47bb-0310-9956-ffa450edef68
parent 5cf0e57849
commit 4a0965323c
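For reference, the restored org.apache.hadoop.mapred.lib.InputSampler lets a job written against the hadoop-1 (mapred) API keep compiling and running unchanged. Below is a minimal sketch of that usage, assuming the classic total-order sort wiring; the class name, partition-file path, and job setup are illustrative, not part of this commit:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class TotalOrderSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(TotalOrderSketch.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    // Accept each key with probability 0.1, keeping at most 10000 samples
    // drawn from at most 10 splits -- the RandomSampler(freq, numSamples,
    // maxSplitsSampled) constructor restored by this commit.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
    // Illustrative partition-file location for the TotalOrderPartitioner.
    TotalOrderPartitioner.setPartitionFile(job, new Path("/tmp/_partitions"));
    // Old-API entry point; internally it wraps the JobConf in a new-API Job
    // and delegates to the mapreduce-side writePartitionFile.
    InputSampler.writePartitionFile(job, sampler);
    // ... set input/output paths and formats, then submit via JobClient.
  }
}

This is the same shape the hadoop-1 sorter example uses: its -totalOrder <pcnt> <num samples> <max splits> option feeds the three RandomSampler arguments above.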
CHANGES.txt

@@ -62,6 +62,9 @@ Release 2.0.5-beta - UNRELEASED
     MAPREDUCE-5159. Change ValueAggregatorJob to add APIs which can support
     binary compatibility with hadoop-1 examples. (Zhijie Shen via vinodkv)
 
+    MAPREDUCE-5157. Bring back old sampler related code so that we can support
+    binary compatibility with hadoop-1 sorter example. (Zhijie Shen via vinodkv)
+
   OPTIMIZATIONS
 
     MAPREDUCE-4974. Optimising the LineRecordReader initialize() method
org/apache/hadoop/mapred/lib/InputSampler.java

@@ -19,10 +19,18 @@
 package org.apache.hadoop.mapred.lib;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Random;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapreduce.Job;
 
 @InterfaceAudience.Public
@@ -30,6 +38,8 @@ import org.apache.hadoop.mapreduce.Job;
 public class InputSampler<K,V> extends
     org.apache.hadoop.mapreduce.lib.partition.InputSampler<K, V> {
 
+  private static final Log LOG = LogFactory.getLog(InputSampler.class);
+
   public InputSampler(JobConf conf) {
     super(conf);
   }
@@ -38,4 +48,219 @@ public class InputSampler<K,V> extends
       throws IOException, ClassNotFoundException, InterruptedException {
     writePartitionFile(new Job(job), sampler);
   }
+
+  /**
+   * Interface to sample using an {@link org.apache.hadoop.mapred.InputFormat}.
+   */
+  public interface Sampler<K,V> extends
+      org.apache.hadoop.mapreduce.lib.partition.InputSampler.Sampler<K, V> {
+    /**
+     * For a given job, collect and return a subset of the keys from the
+     * input data.
+     */
+    K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException;
+  }
+
+  /**
+   * Samples the first n records from s splits.
+   * Inexpensive way to sample random data.
+   */
+  public static class SplitSampler<K,V> extends
+      org.apache.hadoop.mapreduce.lib.partition.InputSampler.SplitSampler<K, V>
+      implements Sampler<K,V> {
+
+    /**
+     * Create a SplitSampler sampling <em>all</em> splits.
+     * Takes the first numSamples / numSplits records from each split.
+     * @param numSamples Total number of samples to obtain from all selected
+     *                   splits.
+     */
+    public SplitSampler(int numSamples) {
+      this(numSamples, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Create a new SplitSampler.
+     * @param numSamples Total number of samples to obtain from all selected
+     *                   splits.
+     * @param maxSplitsSampled The maximum number of splits to examine.
+     */
+    public SplitSampler(int numSamples, int maxSplitsSampled) {
+      super(numSamples, maxSplitsSampled);
+    }
+
+    /**
+     * From each split sampled, take the first numSamples / numSplits records.
+     */
+    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
+    public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
+      InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
+      ArrayList<K> samples = new ArrayList<K>(numSamples);
+      int splitsToSample = Math.min(maxSplitsSampled, splits.length);
+      int splitStep = splits.length / splitsToSample;
+      int samplesPerSplit = numSamples / splitsToSample;
+      long records = 0;
+      for (int i = 0; i < splitsToSample; ++i) {
+        RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
+            job, Reporter.NULL);
+        K key = reader.createKey();
+        V value = reader.createValue();
+        while (reader.next(key, value)) {
+          samples.add(key);
+          key = reader.createKey();
+          ++records;
+          if ((i+1) * samplesPerSplit <= records) {
+            break;
+          }
+        }
+        reader.close();
+      }
+      return (K[])samples.toArray();
+    }
+  }
+
+  /**
+   * Sample from random points in the input.
+   * General-purpose sampler. Takes numSamples / maxSplitsSampled inputs from
+   * each split.
+   */
+  public static class RandomSampler<K,V> extends
+      org.apache.hadoop.mapreduce.lib.partition.InputSampler.RandomSampler<K, V>
+      implements Sampler<K,V> {
+
+    /**
+     * Create a new RandomSampler sampling <em>all</em> splits.
+     * This will read every split at the client, which is very expensive.
+     * @param freq Probability with which a key will be chosen.
+     * @param numSamples Total number of samples to obtain from all selected
+     *                   splits.
+     */
+    public RandomSampler(double freq, int numSamples) {
+      this(freq, numSamples, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Create a new RandomSampler.
+     * @param freq Probability with which a key will be chosen.
+     * @param numSamples Total number of samples to obtain from all selected
+     *                   splits.
+     * @param maxSplitsSampled The maximum number of splits to examine.
+     */
+    public RandomSampler(double freq, int numSamples, int maxSplitsSampled) {
+      super(freq, numSamples, maxSplitsSampled);
+    }
+
+    /**
+     * Randomize the split order, then take the specified number of keys from
+     * each split sampled, where each key is selected with the specified
+     * probability and possibly replaced by a subsequently selected key when
+     * the quota of keys from that split is satisfied.
+     */
+    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
+    public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
+      InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
+      ArrayList<K> samples = new ArrayList<K>(numSamples);
+      int splitsToSample = Math.min(maxSplitsSampled, splits.length);
+
+      Random r = new Random();
+      long seed = r.nextLong();
+      r.setSeed(seed);
+      LOG.debug("seed: " + seed);
+      // shuffle splits
+      for (int i = 0; i < splits.length; ++i) {
+        InputSplit tmp = splits[i];
+        int j = r.nextInt(splits.length);
+        splits[i] = splits[j];
+        splits[j] = tmp;
+      }
+      // our target rate is in terms of the maximum number of sample splits,
+      // but we accept the possibility of sampling additional splits to hit
+      // the target sample keyset
+      for (int i = 0; i < splitsToSample ||
+                     (i < splits.length && samples.size() < numSamples); ++i) {
+        RecordReader<K,V> reader = inf.getRecordReader(splits[i], job,
+            Reporter.NULL);
+        K key = reader.createKey();
+        V value = reader.createValue();
+        while (reader.next(key, value)) {
+          if (r.nextDouble() <= freq) {
+            if (samples.size() < numSamples) {
+              samples.add(key);
+            } else {
+              // When exceeding the maximum number of samples, replace a
+              // random element with this one, then adjust the frequency
+              // to reflect the possibility of existing elements being
+              // pushed out
+              int ind = r.nextInt(numSamples);
+              if (ind != numSamples) {
+                samples.set(ind, key);
+              }
+              freq *= (numSamples - 1) / (double) numSamples;
+            }
+            key = reader.createKey();
+          }
+        }
+        reader.close();
+      }
+      return (K[])samples.toArray();
+    }
+  }
+
+  /**
+   * Sample from s splits at regular intervals.
+   * Useful for sorted data.
+   */
+  public static class IntervalSampler<K,V> extends
+      org.apache.hadoop.mapreduce.lib.partition.InputSampler.IntervalSampler<K, V>
+      implements Sampler<K,V> {
+
+    /**
+     * Create a new IntervalSampler sampling <em>all</em> splits.
+     * @param freq The frequency with which records will be emitted.
+     */
+    public IntervalSampler(double freq) {
+      this(freq, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Create a new IntervalSampler.
+     * @param freq The frequency with which records will be emitted.
+     * @param maxSplitsSampled The maximum number of splits to examine.
+     * @see #getSample
+     */
+    public IntervalSampler(double freq, int maxSplitsSampled) {
+      super(freq, maxSplitsSampled);
+    }
+
+    /**
+     * For each split sampled, emit when the ratio of the number of records
+     * retained to the total record count is less than the specified
+     * frequency.
+     */
+    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
+    public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
+      InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
+      ArrayList<K> samples = new ArrayList<K>();
+      int splitsToSample = Math.min(maxSplitsSampled, splits.length);
+      int splitStep = splits.length / splitsToSample;
+      long records = 0;
+      long kept = 0;
+      for (int i = 0; i < splitsToSample; ++i) {
+        RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
+            job, Reporter.NULL);
+        K key = reader.createKey();
+        V value = reader.createValue();
+        while (reader.next(key, value)) {
+          ++records;
+          if ((double) kept / records < freq) {
+            ++kept;
+            samples.add(key);
+            key = reader.createKey();
+          }
+        }
+        reader.close();
+      }
+      return (K[])samples.toArray();
+    }
+  }
+
 }
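A side note on the RandomSampler logic restored above: once the sample set is full, each newly accepted key overwrites a random resident, and the acceptance frequency is scaled by (numSamples - 1) / numSamples to reflect the possibility of existing elements being pushed out. Below is a standalone sketch of just that replacement step, with hypothetical names and plain ints in place of Hadoop key types:

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class ReplacementSketch {
  // Mirrors the shape of RandomSampler.getSample's inner loop: accept a key
  // with probability freq; once numSamples keys are held, overwrite a
  // random slot and shrink freq to compensate for future evictions.
  static List<Integer> sample(int[] keys, double freq, int numSamples) {
    List<Integer> samples = new ArrayList<Integer>(numSamples);
    Random r = new Random();
    for (int key : keys) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(key);
        } else {
          samples.set(r.nextInt(numSamples), key); // evict a random resident
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    return samples;
  }
}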
org/apache/hadoop/mapreduce/lib/partition/InputSampler.java

@@ -96,8 +96,8 @@ public class InputSampler<K,V> extends Configured implements Tool {
    */
   public static class SplitSampler<K,V> implements Sampler<K,V> {
 
-    private final int numSamples;
-    private final int maxSplitsSampled;
+    protected final int numSamples;
+    protected final int maxSplitsSampled;
 
     /**
      * Create a SplitSampler sampling <em>all</em> splits.
@@ -157,9 +157,9 @@ public class InputSampler<K,V> extends Configured implements Tool {
    * each split.
    */
   public static class RandomSampler<K,V> implements Sampler<K,V> {
-    private double freq;
-    private final int numSamples;
-    private final int maxSplitsSampled;
+    protected double freq;
+    protected final int numSamples;
+    protected final int maxSplitsSampled;
 
     /**
      * Create a new RandomSampler sampling <em>all</em> splits.
@@ -249,8 +249,8 @@ public class InputSampler<K,V> extends Configured implements Tool {
    * Useful for sorted data.
    */
   public static class IntervalSampler<K,V> implements Sampler<K,V> {
-    private final double freq;
-    private final int maxSplitsSampled;
+    protected final double freq;
+    protected final int maxSplitsSampled;
 
     /**
      * Create a new IntervalSampler sampling <em>all</em> splits.
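The only change to the new-API InputSampler above is widening field visibility from private to protected; that is what lets the restored old-API samplers in the first file extend these classes and read the configured state directly in their own getSample(InputFormat, JobConf) overloads. A minimal sketch of the pattern, with a hypothetical subclass name for illustration:

import org.apache.hadoop.mapreduce.lib.partition.InputSampler;

// Hypothetical subclass: with numSamples and maxSplitsSampled now protected,
// a subclass in another package can read them instead of duplicating the
// configuration state -- exactly what mapred.lib.InputSampler.SplitSampler
// does in this commit.
class VisibilitySketch<K, V> extends InputSampler.SplitSampler<K, V> {
  VisibilitySketch(int numSamples, int maxSplitsSampled) {
    super(numSamples, maxSplitsSampled);
  }

  int samplesPerSplit(int splitCount) {
    // Both fields are inherited protected members after this change.
    return numSamples / Math.min(maxSplitsSampled, splitCount);
  }
}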
org/apache/hadoop/mapreduce/lib/partition/TestInputSampler.java

@@ -17,23 +17,26 @@
  */
 package org.apache.hadoop.mapreduce.lib.partition;
 
+import static org.junit.Assert.assertEquals;
+
+import java.io.DataInput;
+import java.io.DataOutput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
-import org.junit.Test;
-import static org.junit.Assert.*;
-
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.junit.Test;
 
 public class TestInputSampler {
 
@@ -47,6 +50,24 @@ public class TestInputSampler {
     public int getInit() { return i; }
   }
 
+  static class MapredSequentialSplit implements org.apache.hadoop.mapred.InputSplit {
+    private int i;
+    MapredSequentialSplit(int i) {
+      this.i = i;
+    }
+    @Override
+    public long getLength() { return 0; }
+    @Override
+    public String[] getLocations() { return new String[0]; }
+    public int getInit() { return i; }
+    @Override
+    public void write(DataOutput out) throws IOException {
+    }
+    @Override
+    public void readFields(DataInput in) throws IOException {
+    }
+  }
+
   static class TestInputSamplerIF
       extends InputFormat<IntWritable,NullWritable> {
 
@@ -90,6 +111,71 @@ public class TestInputSampler {
 
   }
 
+  static class TestMapredInputSamplerIF extends TestInputSamplerIF implements
+      org.apache.hadoop.mapred.InputFormat<IntWritable,NullWritable> {
+
+    TestMapredInputSamplerIF(int maxDepth, int numSplits, int... splitInit) {
+      super(maxDepth, numSplits, splitInit);
+    }
+
+    @Override
+    public org.apache.hadoop.mapred.InputSplit[] getSplits(JobConf job,
+        int numSplits) throws IOException {
+      List<InputSplit> splits = null;
+      try {
+        splits = getSplits(Job.getInstance(job));
+      } catch (InterruptedException e) {
+        throw new IOException(e);
+      }
+      org.apache.hadoop.mapred.InputSplit[] retVals =
+          new org.apache.hadoop.mapred.InputSplit[splits.size()];
+      for (int i = 0; i < splits.size(); ++i) {
+        MapredSequentialSplit split = new MapredSequentialSplit(
+            ((SequentialSplit) splits.get(i)).getInit());
+        retVals[i] = split;
+      }
+      return retVals;
+    }
+
+    @Override
+    public org.apache.hadoop.mapred.RecordReader<IntWritable, NullWritable>
+        getRecordReader(final org.apache.hadoop.mapred.InputSplit split,
+            JobConf job, Reporter reporter) throws IOException {
+      return new org.apache.hadoop.mapred.RecordReader
+          <IntWritable, NullWritable>() {
+        private final IntWritable i =
+            new IntWritable(((MapredSequentialSplit)split).getInit());
+        private int maxVal = i.get() + maxDepth + 1;
+
+        @Override
+        public boolean next(IntWritable key, NullWritable value)
+            throws IOException {
+          i.set(i.get() + 1);
+          return i.get() < maxVal;
+        }
+        @Override
+        public IntWritable createKey() {
+          return new IntWritable(i.get());
+        }
+        @Override
+        public NullWritable createValue() {
+          return NullWritable.get();
+        }
+        @Override
+        public long getPos() throws IOException {
+          return 0;
+        }
+        @Override
+        public void close() throws IOException {
+        }
+        @Override
+        public float getProgress() throws IOException {
+          return 0;
+        }
+      };
+    }
+  }
+
   /**
    * Verify SplitSampler contract, that an equal number of records are taken
    * from the first splits.
@@ -118,6 +204,36 @@ public class TestInputSampler {
     }
   }
 
+  /**
+   * Verify SplitSampler contract in mapred.lib.InputSampler, which is added
+   * back for binary compatibility of M/R 1.x
+   */
+  @Test (timeout = 30000)
+  @SuppressWarnings("unchecked") // IntWritable comparator not typesafe
+  public void testMapredSplitSampler() throws Exception {
+    final int TOT_SPLITS = 15;
+    final int NUM_SPLITS = 5;
+    final int STEP_SAMPLE = 5;
+    final int NUM_SAMPLES = NUM_SPLITS * STEP_SAMPLE;
+    org.apache.hadoop.mapred.lib.InputSampler.Sampler<IntWritable,NullWritable>
+        sampler = new org.apache.hadoop.mapred.lib.InputSampler.SplitSampler
+            <IntWritable,NullWritable>(NUM_SAMPLES, NUM_SPLITS);
+    int inits[] = new int[TOT_SPLITS];
+    for (int i = 0; i < TOT_SPLITS; ++i) {
+      inits[i] = i * STEP_SAMPLE;
+    }
+    Object[] samples = sampler.getSample(
+        new TestMapredInputSamplerIF(100000, TOT_SPLITS, inits),
+        new JobConf());
+    assertEquals(NUM_SAMPLES, samples.length);
+    Arrays.sort(samples, new IntWritable.Comparator());
+    for (int i = 0; i < NUM_SAMPLES; ++i) {
+      // mapred.lib.InputSampler.SplitSampler has a sampling step
+      assertEquals(i % STEP_SAMPLE + TOT_SPLITS * (i / STEP_SAMPLE),
+          ((IntWritable)samples[i]).get());
+    }
+  }
+
   /**
    * Verify IntervalSampler contract, that samples are taken at regular
    * intervals from the given splits.
@@ -146,4 +262,33 @@ public class TestInputSampler {
     }
   }
 
+  /**
+   * Verify IntervalSampler in mapred.lib.InputSampler, which is added back
+   * for binary compatibility of M/R 1.x
+   */
+  @Test (timeout = 30000)
+  @SuppressWarnings("unchecked") // IntWritable comparator not typesafe
+  public void testMapredIntervalSampler() throws Exception {
+    final int TOT_SPLITS = 16;
+    final int PER_SPLIT_SAMPLE = 4;
+    final int NUM_SAMPLES = TOT_SPLITS * PER_SPLIT_SAMPLE;
+    final double FREQ = 1.0 / TOT_SPLITS;
+    org.apache.hadoop.mapred.lib.InputSampler.Sampler<IntWritable,NullWritable>
+        sampler = new org.apache.hadoop.mapred.lib.InputSampler.IntervalSampler
+            <IntWritable,NullWritable>(FREQ, NUM_SAMPLES);
+    int inits[] = new int[TOT_SPLITS];
+    for (int i = 0; i < TOT_SPLITS; ++i) {
+      inits[i] = i;
+    }
+    Job ignored = Job.getInstance();
+    Object[] samples = sampler.getSample(new TestInputSamplerIF(
+        NUM_SAMPLES, TOT_SPLITS, inits), ignored);
+    assertEquals(NUM_SAMPLES, samples.length);
+    Arrays.sort(samples, new IntWritable.Comparator());
+    for (int i = 0; i < NUM_SAMPLES; ++i) {
+      assertEquals(i,
+          ((IntWritable)samples[i]).get());
+    }
+  }
+
 }