package com.baeldung;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class WordCount {

    private static final Pattern SPACE = Pattern.compile(" ");

    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: JavaWordCount <file>");
            System.exit(1);
        }

        SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
            .setMaster("local");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);

        // Read the input file as an RDD of lines, using a single partition.
        JavaRDD<String> lines = ctx.textFile(args[0], 1);

        // Split each line on single spaces to get an RDD of individual words.
        JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());

        // Map every word to a (word, 1) pair.
        JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));

        // Sum the counts for each distinct word.
        JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((i1, i2) -> i1 + i2);

        // Collect the results to the driver and print each word with its count.
        List<Tuple2<String, Integer>> output = wordWithCount.collect();
        for (Tuple2<String, Integer> tuple : output) {
            System.out.println(tuple._1() + ": " + tuple._2());
        }
        ctx.stop();
    }
}
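
// Example invocation, as a sketch only: the jar name and input file below are
// hypothetical and depend on your build setup.
//
//   spark-submit --class com.baeldung.WordCount \
//       --master local target/word-count-1.0.jar input.txt
//
// Note that because the SparkConf above hard-codes setMaster("local"), that
// setting takes precedence over the --master flag passed to spark-submit.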