From 8e190c76af721cdae529b4a6d01f8dc9d8666379 Mon Sep 17 00:00:00 2001 From: Kumar Chandrakant Date: Sat, 24 Aug 2019 02:02:02 +0530 Subject: [PATCH] Adding source code for the tutorial tracked under BAEL-3203. (#7600) --- apache-spark/data/iris.data | 150 ++++++++++++++++++ .../logistic-regression/data/._SUCCESS.crc | Bin 0 -> 8 bytes ...-80d9-8e1edf399601-c000.snappy.parquet.crc | Bin 0 -> 24 bytes .../model/logistic-regression/data/_SUCCESS | 0 ...41ff-80d9-8e1edf399601-c000.snappy.parquet | Bin 0 -> 1789 bytes .../metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../metadata/.part-00000.crc | Bin 0 -> 12 bytes .../logistic-regression/metadata/_SUCCESS | 0 .../logistic-regression/metadata/part-00000 | 1 + apache-spark/pom.xml | 132 +++++++-------- .../com/baeldung/ml/MachineLearningApp.java | 111 +++++++++++++ 11 files changed, 332 insertions(+), 62 deletions(-) create mode 100644 apache-spark/data/iris.data create mode 100644 apache-spark/model/logistic-regression/data/._SUCCESS.crc create mode 100644 apache-spark/model/logistic-regression/data/.part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet.crc create mode 100644 apache-spark/model/logistic-regression/data/_SUCCESS create mode 100644 apache-spark/model/logistic-regression/data/part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet create mode 100644 apache-spark/model/logistic-regression/metadata/._SUCCESS.crc create mode 100644 apache-spark/model/logistic-regression/metadata/.part-00000.crc create mode 100644 apache-spark/model/logistic-regression/metadata/_SUCCESS create mode 100644 apache-spark/model/logistic-regression/metadata/part-00000 create mode 100644 apache-spark/src/main/java/com/baeldung/ml/MachineLearningApp.java diff --git a/apache-spark/data/iris.data b/apache-spark/data/iris.data new file mode 100644 index 0000000000..396653cc98 --- /dev/null +++ b/apache-spark/data/iris.data @@ -0,0 +1,150 @@ +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa 
+4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor 
+6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica 
+7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/apache-spark/model/logistic-regression/data/._SUCCESS.crc b/apache-spark/model/logistic-regression/data/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/apache-spark/model/logistic-regression/data/.part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet.crc b/apache-spark/model/logistic-regression/data/.part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..46311024cff2ce7756c0f80479f5268efd069597 GIT binary patch literal 24 gcmYc;N@ieSU}Bj6=FX}-{Q+{WxhI4AG>Jn!?q=Q-)j)TKNU z&>&KCNPALPhCV&d1-K9e6nmU9)0k9=4AQg$b9Kj+`QqM3#5Tj zLMTXT@(!`;3!aN)0ZBtJWUtdAL;KlAQsb_*NO+cIS1psca3p|=t8r?%9+G@Wl2Fnh zw#O0^w=r#Rkk=2?VIE?;9hU78$0RLJfW4&d@D?26E;MOXP^45kMYD1mL>FWskvbfx zSyl9eacIU|*Z`G6>BI%{lzX6a&HD}X zbA~g@boQyZo?;a!`qxr)C$gSlK=~;uzs2mTII)S_qSC&^IIk?EB6H(945gO+5(AFF zA-xs&AjgT>CjhRXt*IdtOo1T`_Y8IhYR~x^&q}jN^LR}j|%+C!m?J5J9 z6uXeG{ZqCtTSGZ{^NT1~b!=hUVHB4QS(}M1`@ulVET-YTuVA!I@Za|$lYw$3k&EZ`)h76%z>^Qgsue&$V z?y=GPjB2T1`8X*Gu|v=CIMWXfsrBZQtj1 - 4.0.0 - com.baeldung - apache-spark - 1.0-SNAPSHOT - apache-spark - jar - 
http://maven.apache.org + + 4.0.0 + com.baeldung + apache-spark + 1.0-SNAPSHOT + apache-spark + jar + http://maven.apache.org - - com.baeldung - parent-modules - 1.0.0-SNAPSHOT - + + com.baeldung + parent-modules + 1.0.0-SNAPSHOT + - - - org.apache.spark - spark-core_2.11 - ${org.apache.spark.spark-core.version} - provided - + - org.apache.spark - spark-sql_2.11 - ${org.apache.spark.spark-sql.version} - provided + org.apache.spark + spark-core_2.11 + ${org.apache.spark.spark-core.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${org.apache.spark.spark-sql.version} + provided org.apache.spark @@ -33,6 +34,12 @@ ${org.apache.spark.spark-streaming.version} provided + + org.apache.spark + spark-mllib_2.11 + ${org.apache.spark.spark-mllib.version} + provided + org.apache.spark spark-streaming-kafka-0-10_2.11 @@ -48,46 +55,47 @@ spark-cassandra-connector-java_2.11 ${com.datastax.spark.spark-cassandra-connector-java.version} - + - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${java.version} - ${java.version} - - - - maven-assembly-plugin - - - package - - single - - - - - - jar-with-dependencies - - - - - + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven-compiler-plugin.version} + + ${java.version} + ${java.version} + + + + maven-assembly-plugin + + + package + + single + + + + + + jar-with-dependencies + + + + + - - 2.3.0 - 2.3.0 - 2.3.0 - 2.3.0 - 2.3.0 - 1.5.2 + + 2.3.0 + 2.3.0 + 2.3.0 + 2.3.0 + 2.3.0 + 2.3.0 + 1.5.2 3.2 - + diff --git a/apache-spark/src/main/java/com/baeldung/ml/MachineLearningApp.java b/apache-spark/src/main/java/com/baeldung/ml/MachineLearningApp.java new file mode 100644 index 0000000000..6094683031 --- /dev/null +++ b/apache-spark/src/main/java/com/baeldung/ml/MachineLearningApp.java @@ -0,0 +1,111 @@ +package com.baeldung.ml; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import 
org.apache.spark.SparkConf;
// (the fragment above completes the "import" keyword that ends the preceding line of this dump)
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
import org.apache.spark.mllib.stat.Statistics;

import scala.Tuple2;

/**
 * Tutorial application that walks through a small Spark MLlib workflow on the Iris data set:
 * load the CSV, print column statistics and a Pearson correlation matrix, train a three-class
 * logistic-regression model on an 80/20 split, print its accuracy on the held-out part, then
 * save the model, load it back and predict on one hand-written sample.
 *
 * <p>Input is read from {@value #DATA_FILE} and the model is written to {@value #MODEL_DIR};
 * both are resolved relative to the working directory.
 */
public class MachineLearningApp {

    /** Location of the Iris CSV. Forward slashes keep the path usable on every platform. */
    private static final String DATA_FILE = "data/iris.data";

    /** Directory the trained model is saved to and re-loaded from. */
    private static final String MODEL_DIR = "model/logistic-regression";

    /**
     * Runs the complete workflow and prints every intermediate result to standard output.
     *
     * @param args ignored
     * @throws IllegalArgumentException (surfaced through the failing Spark job) if a record
     *         carries a class label other than the three Iris species
     */
    public static void main(String[] args) {

        // 1. Setting the Spark Context
        // NOTE(review): spark.driver.memory is set after the driver JVM is already running, so
        // it presumably has no effect with a local master - confirm against the Spark
        // configuration docs before relying on it.
        SparkConf conf = new SparkConf().setAppName("Main")
            .setMaster("local[2]")
            .set("spark.executor.memory", "3g")
            .set("spark.driver.memory", "3g");

        // try-with-resources: the context is closed (step 8) even when a step below throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            Logger.getLogger("org")
                .setLevel(Level.OFF);
            Logger.getLogger("akka")
                .setLevel(Level.OFF);

            // 2. Loading the Data-set
            // Blank lines (e.g. a trailing newline in the file) carry no record and are skipped.
            JavaRDD<String> data = sc.textFile(DATA_FILE)
                .filter(line -> !line.trim()
                    .isEmpty());

            // 3. Exploratory Data Analysis
            // 3.1. Creating Vector of Input Data
            JavaRDD<Vector> inputData = data.map(line -> Vectors.dense(parseFeatures(line.split(","))));
            // 3.2. Performing Statistical Analysis
            MultivariateStatisticalSummary summary = Statistics.colStats(inputData.rdd());
            System.out.println("Summary Mean:");
            System.out.println(summary.mean());
            System.out.println("Summary Variance:");
            System.out.println(summary.variance());
            System.out.println("Summary Non-zero:");
            System.out.println(summary.numNonzeros());
            // 3.3. Performing Correlation Analysis
            Matrix correlMatrix = Statistics.corr(inputData.rdd(), "pearson");
            System.out.println("Correlation Matrix:");
            System.out.println(correlMatrix.toString());

            // 4. Data Preparation
            // 4.1. Creating Map for Textual Output Labels
            Map<String, Integer> labelIndex = new HashMap<>();
            labelIndex.put("Iris-setosa", 0);
            labelIndex.put("Iris-versicolor", 1);
            labelIndex.put("Iris-virginica", 2);
            // 4.2. Creating LabeledPoint of Input and Output Data
            JavaRDD<LabeledPoint> parsedData = data.map(line -> toLabeledPoint(line, labelIndex));

            // 5. Data Splitting into 80% Training and 20% Test Sets
            // The fixed seed makes the split reproducible between runs.
            JavaRDD<LabeledPoint>[] splits = parsedData.randomSplit(new double[] { 0.8, 0.2 }, 11L);
            JavaRDD<LabeledPoint> trainingData = splits[0].cache();
            JavaRDD<LabeledPoint> testData = splits[1];

            // 6. Modeling
            // 6.1. Model Training
            LogisticRegressionModel model = new LogisticRegressionWithLBFGS().setNumClasses(3)
                .run(trainingData.rdd());
            // 6.2. Model Evaluation
            JavaPairRDD<Object, Object> predictionAndLabels = testData
                .mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label()));
            MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
            double accuracy = metrics.accuracy();
            System.out.println("Model Accuracy on Test Data: " + accuracy);

            // 7. Model Saving and Loading
            // 7.1. Model Saving
            // NOTE(review): save() presumably fails when MODEL_DIR already exists - confirm, and
            // clear the directory before re-running if so.
            model.save(sc.sc(), MODEL_DIR);
            // 7.2. Model Loading
            LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc.sc(), MODEL_DIR);
            // 7.3. Prediction on New Data
            Vector newData = Vectors.dense(new double[] { 1, 1, 1, 1 });
            double prediction = sameModel.predict(newData);
            System.out.println("Model Prediction on New Data = " + prediction);

            // 8. Clean-up: the context is closed by the enclosing try-with-resources.
        }
    }

    /**
     * Parses every field except the last one as a {@code double}.
     *
     * @param fields the comma-separated fields of one record; the last field is the class label
     * @return the numeric feature values, in file order
     * @throws NumberFormatException if a feature field is not a valid number
     */
    private static double[] parseFeatures(String[] fields) {
        double[] features = new double[fields.length - 1];
        for (int i = 0; i < features.length; i++) {
            features[i] = Double.parseDouble(fields[i]);
        }
        return features;
    }

    /**
     * Converts one CSV record into a {@link LabeledPoint}.
     *
     * @param line   the raw record; its last field is the textual class label
     * @param labels mapping from textual class label to numeric class index
     * @return the labelled feature vector for the record
     * @throws IllegalArgumentException if the record's label is not a key of {@code labels}
     */
    private static LabeledPoint toLabeledPoint(String line, Map<String, Integer> labels) {
        String[] fields = line.split(",");
        String labelName = fields[fields.length - 1];
        Integer label = labels.get(labelName);
        if (label == null) {
            // Fail with the offending record instead of an opaque NullPointerException on unboxing.
            throw new IllegalArgumentException("Unknown class label '" + labelName + "' in line: " + line);
        }
        return new LabeledPoint(label, Vectors.dense(parseFeatures(fields)));
    }

}