Update DetermineHashedPartitionsJob.java

Using CombineTextInputFormat instead of TextInputFormat combines multiple input splits into a single mapper, which reduces the strain on the Hadoop platform. It greatly improves job completion time because there are fewer mappers to keep track of.
This commit is contained in:
Deepak 2014-05-22 15:06:56 +05:30
parent 5ce80068d2
commit de0a7b27e7
1 changed files with 2 additions and 1 deletions

View File

@ -49,6 +49,7 @@ import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.joda.time.DateTime; import org.joda.time.DateTime;
@ -91,7 +92,7 @@ public class DetermineHashedPartitionsJob implements Jobby
); );
JobHelper.injectSystemProperties(groupByJob); JobHelper.injectSystemProperties(groupByJob);
groupByJob.setInputFormatClass(TextInputFormat.class); groupByJob.setInputFormatClass(CombineTextInputFormat.class);
groupByJob.setMapperClass(DetermineCardinalityMapper.class); groupByJob.setMapperClass(DetermineCardinalityMapper.class);
groupByJob.setMapOutputKeyClass(LongWritable.class); groupByJob.setMapOutputKeyClass(LongWritable.class);
groupByJob.setMapOutputValueClass(BytesWritable.class); groupByJob.setMapOutputValueClass(BytesWritable.class);