2017-10-23 02:16:20 -04:00
|
|
|
package com.baeldung;
|
|
|
|
|
|
|
|
import java.util.Arrays;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
|
|
|
import org.apache.spark.SparkConf;
|
|
|
|
import org.apache.spark.api.java.JavaPairRDD;
|
|
|
|
import org.apache.spark.api.java.JavaRDD;
|
|
|
|
import org.apache.spark.api.java.JavaSparkContext;
|
|
|
|
import org.apache.spark.api.java.function.*;
|
|
|
|
import org.apache.spark.api.java.function.Function2;
|
|
|
|
import org.apache.spark.api.java.function.PairFunction;
|
|
|
|
|
|
|
|
import scala.Tuple2;
|
|
|
|
|
|
|
|
public class WordCount {
|
|
|
|
|
|
|
|
private static final Pattern SPACE = Pattern.compile(" ");
|
|
|
|
|
2017-10-26 13:46:35 -04:00
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
|
if (args.length < 1) {
|
|
|
|
System.err.println("Usage: JavaWordCount <file>");
|
|
|
|
System.exit(1);
|
|
|
|
}
|
|
|
|
SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
|
|
|
|
.setMaster("local");
|
|
|
|
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
|
|
|
|
JavaRDD<String> lines = ctx.textFile(args[0], 1);
|
|
|
|
|
|
|
|
JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
|
|
|
|
JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
|
|
|
|
JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2)->i1 + i2);
|
|
|
|
List<Tuple2<String, Integer>> output = wordWithCount.collect();
|
|
|
|
for (Tuple2<?, ?> tuple : output) {
|
|
|
|
System.out.println(tuple._1() + ": " + tuple._2());
|
|
|
|
}
|
|
|
|
ctx.stop();
|
2017-10-23 02:16:20 -04:00
|
|
|
}
|
|
|
|
}
|