init
This commit is contained in:
parent
f81a203765
commit
46765cf410
|
@ -0,0 +1 @@
|
||||||
|
/build/
|
|
@ -0,0 +1,44 @@
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<groupId>com.baeldung</groupId>
|
||||||
|
<artifactId>parent-modules</artifactId>
|
||||||
|
<version>1.0.0-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<groupId>com.baeldung.apache</groupId>
|
||||||
|
<artifactId>apache-beam</artifactId>
|
||||||
|
<version>0.0.1-SNAPSHOT</version>
|
||||||
|
<name>apache-beam</name>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.beam</groupId>
|
||||||
|
<artifactId>beam-sdks-java-core</artifactId>
|
||||||
|
<version>${beam.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<!-- runtime scoped -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.beam</groupId>
|
||||||
|
<artifactId>beam-runners-direct-java</artifactId>
|
||||||
|
<version>${beam.version}</version>
|
||||||
|
<scope>runtime</scope>
|
||||||
|
</dependency>
|
||||||
|
<!-- test scoped -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.assertj</groupId>
|
||||||
|
<artifactId>assertj-core</artifactId>
|
||||||
|
<version>${assertj.version}</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<beam.version>2.19.0</beam.version>
|
||||||
|
<assertj.version>3.6.1</assertj.version>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
</project>
|
|
@ -0,0 +1,71 @@
|
||||||
|
package com.baeldung.apache.beam;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.apache.beam.sdk.Pipeline;
|
||||||
|
import org.apache.beam.sdk.io.TextIO;
|
||||||
|
import org.apache.beam.sdk.options.PipelineOptions;
|
||||||
|
import org.apache.beam.sdk.options.PipelineOptionsFactory;
|
||||||
|
import org.apache.beam.sdk.transforms.Count;
|
||||||
|
import org.apache.beam.sdk.transforms.Filter;
|
||||||
|
import org.apache.beam.sdk.transforms.FlatMapElements;
|
||||||
|
import org.apache.beam.sdk.transforms.MapElements;
|
||||||
|
import org.apache.beam.sdk.values.KV;
|
||||||
|
import org.apache.beam.sdk.values.PCollection;
|
||||||
|
import org.apache.beam.sdk.values.TypeDescriptors;
|
||||||
|
|
||||||
|
public class WordCount {
|
||||||
|
|
||||||
|
public static boolean wordCount(String inputFilePath, String outputFilePath) {
|
||||||
|
// We use default options
|
||||||
|
PipelineOptions options = PipelineOptionsFactory.create();
|
||||||
|
// to create the pipeline
|
||||||
|
Pipeline p = Pipeline.create(options);
|
||||||
|
// Here is our workflow graph
|
||||||
|
PCollection<KV<String, Long>> wordCount = p
|
||||||
|
.apply("(1) Read all lines", TextIO.read().from(inputFilePath))
|
||||||
|
.apply("(2) Flatmap to a list of words", FlatMapElements.into(TypeDescriptors.strings())
|
||||||
|
.via(line -> Arrays.asList(line.split("\\s"))))
|
||||||
|
.apply("(3) Lowercase all", MapElements.into(TypeDescriptors.strings())
|
||||||
|
.via(word -> word.toLowerCase()))
|
||||||
|
.apply("(4) Trim punctuations", MapElements.into(TypeDescriptors.strings())
|
||||||
|
.via(word -> trim(word)))
|
||||||
|
.apply("(5) Filter stopwords", Filter.by(word -> !isStopWord(word)))
|
||||||
|
.apply("(6) Count words", Count.perElement());
|
||||||
|
// We convert the PCollection to String so that we can write it to file
|
||||||
|
wordCount.apply(MapElements.into(TypeDescriptors.strings())
|
||||||
|
.via(count -> count.getKey() + " --> " + count.getValue()))
|
||||||
|
.apply(TextIO.write().to(outputFilePath));
|
||||||
|
// Finally we must run the pipeline, otherwise it's only a definition
|
||||||
|
p.run().waitUntilFinish();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isStopWord(String word) {
|
||||||
|
String[] stopwords = {"am", "are", "is", "i", "you", "me",
|
||||||
|
"he", "she", "they", "them", "was",
|
||||||
|
"were", "from", "in", "of", "to", "be",
|
||||||
|
"him", "her", "us", "and", "or"};
|
||||||
|
for (String stopword : stopwords) {
|
||||||
|
if (stopword.compareTo(word) == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String trim(String word) {
|
||||||
|
return word.replace("(","")
|
||||||
|
.replace(")", "")
|
||||||
|
.replace(",", "")
|
||||||
|
.replace(".", "")
|
||||||
|
.replace("\"", "")
|
||||||
|
.replace("'", "")
|
||||||
|
.replace(":", "")
|
||||||
|
.replace(";", "")
|
||||||
|
.replace("-", "")
|
||||||
|
.replace("?", "")
|
||||||
|
.replace("!", "");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
package com.baeldung.apache.beam;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class WordCountUnitTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Ignore
|
||||||
|
public void givenInputFile_whenWordCountRuns_thenJobFinishWithoutError() {
|
||||||
|
boolean jobDone = WordCount.wordCount("src/test/resources/wordcount.txt", "target/output");
|
||||||
|
assertTrue(jobDone);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,16 @@
|
||||||
|
We've all heard the scare stories about North Korea: the homemade nuclear arsenal built while their people starve and then aimed imprecisely at the rest of the world, a
|
||||||
|
leader so deluded he makes L Ron Hubbard look like a man excessively overburdened with self-doubt and their deep-seated belief that foreign capitalists will invade at any
|
||||||
|
moment and steal all their bauxite.
|
||||||
|
The popular portrayal of this Marxist nation is something like one of the more harrowing episodes of M*A*S*H, only with the cast of wacky characters replaced by twitchy,
|
||||||
|
heavily armed Stalinist meth addicts
|
||||||
|
Cracked would like to take a moment to celebrate the good things about North Korea though, the things that the country's enemies prefer to suppress as part of their politically
|
||||||
|
motivated jealousy. Like how no different to you and me, there's nothing every North Korean likes more after an 18 hour shift at the phosphorus plant than a nice beer to go with
|
||||||
|
his dried fish ration. Ever attentive to its people's needs and in the twinkling of a decade, North Korea's leadership bought, disassembled, transported and rebuilt a British
|
||||||
|
brewery in order to discover and reproduce the secrets of beer and then brew the sweet nectar for its hardworking people, up to 18 bottles at a time. And with minimal fatalities.
|
||||||
|
When was the last time YOUR leader got a beer for YOU, American? (NB do not answer this question if you are Henry Louis Gates).
|
||||||
|
Or how about the fried chicken restaurant that downtown Pyongyang boasts? Yes real chicken, fried and then delivered to your sleeping cube, with optional beer if you like! You
|
||||||
|
don't even have to remove the feathers or pull out the gizzard yourself. Mostly. Americans must eat their fried chicken from a bucket, like swine, sold by a company so secretive
|
||||||
|
that even the very blend of seasoning used is intentionally kept from them. And they call North Korea paranoid?
|
||||||
|
And how many nations would entertain the syphilitic, bourgeois ramblings of Bill Clinton let alone permit him anywhere near their proud womenfolk? Only wise Kim Jong Il could see
|
||||||
|
past Bill's many, many imperfections and treat him with the pity and kindness he deserves, accepting his feeble pleas to pardon the American spies rightly convicted of photographing
|
||||||
|
the nation's sensitive beetroot fields.
|
Loading…
Reference in New Issue