Merge pull request #6974 from Doha2012/master

remove stopwords from string
This commit is contained in:
Loredana Crusoveanu 2019-05-17 20:53:44 +03:00 committed by GitHub
commit a9a55bbd59
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 5182 additions and 55 deletions

View File

@ -15,28 +15,6 @@
</parent>
<dependencies>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons-io.version}</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j.version}</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>${commons-codec.version}</version>
</dependency>
<!-- test scoped -->
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>${assertj.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
@ -57,11 +35,6 @@
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>com.vdurmont</groupId>
<artifactId>emoji-java</artifactId>
<version>${emoji-java.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
@ -73,38 +46,18 @@
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>${junit-jupiter-api.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-library</artifactId>
<version>${org.hamcrest.version}</version>
<scope>test</scope>
</dependency>
<!-- Added for password generation -->
<dependency>
<groupId>org.passay</groupId>
<artifactId>passay</artifactId>
<version>${passay.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>${commons-text.version}</version>
</dependency>
<dependency>
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>${ahocorasick.version}</version>
</dependency>
</dependencies>
<build>
@ -131,18 +84,10 @@
</build>
<properties>
<!-- util -->
<commons-lang3.version>3.8.1</commons-lang3.version>
<commons-codec.version>1.10</commons-codec.version>
<!-- testing -->
<assertj.version>3.6.1</assertj.version>
<icu4j.version>61.1</icu4j.version>
<guava.version>27.0.1-jre</guava.version>
<emoji-java.version>4.0.0</emoji-java.version>
<junit-jupiter-api.version>5.3.1</junit-jupiter-api.version>
<passay.version>1.3.1</passay.version>
<commons-text.version>1.4</commons-text.version>
<ahocorasick.version>0.4.0</ahocorasick.version>
</properties>
</project>

View File

@ -0,0 +1,73 @@
package com.baeldung.string.performance;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
/**
 * JMH benchmark comparing three ways of removing stopwords from a text:
 * a manual loop, {@code Collection.removeAll}, and a regular expression.
 *
 * <p>The text and the stopword list are read from files under
 * {@code src/main/resources}, resolved relative to the working directory.
 */
@Fork(value = 3, warmups = 1)
@State(Scope.Benchmark)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public class RemovingStopwordsPerformanceComparison {

    private String data;
    private List<String> stopwords;
    private String stopwordsRegex;

    /**
     * Delegates to the JMH command-line runner.
     *
     * @param args arguments forwarded unchanged to {@code org.openjdk.jmh.Main}
     * @throws Exception whatever the JMH runner throws
     */
    public static void main(String[] args) throws Exception {
        org.openjdk.jmh.Main.main(args);
    }

    /**
     * Loads the benchmark input: the lower-cased text, the stopword list
     * (one word per line), and the alternation regex built from that list.
     *
     * @throws IOException if either resource file cannot be read
     */
    @Setup
    public void setup() throws IOException {
        // Decode explicitly as UTF-8: new String(byte[]) would use the platform
        // default charset, whereas Files.readAllLines below always uses UTF-8.
        String text = new String(
            Files.readAllBytes(Paths.get("src/main/resources/shakespeare-hamlet.txt")),
            java.nio.charset.StandardCharsets.UTF_8);
        // Locale.ROOT keeps the case mapping independent of the JVM default locale
        // (e.g. under a Turkish locale "I" would not lower-case to "i").
        data = text.toLowerCase(java.util.Locale.ROOT);
        stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
        // Produces \b(w1|w2|...)\b\s? : a whole-word stopword plus at most one
        // trailing whitespace character.
        stopwordsRegex = stopwords.stream().collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
    }

    /**
     * Splits the text on single spaces and re-joins every token that is not
     * in the stopword list.
     *
     * @return the remaining tokens separated by single spaces, trimmed
     */
    @Benchmark
    public String removeManually() {
        String[] allWords = data.split(" ");
        StringBuilder builder = new StringBuilder();
        for (String word : allWords) {
            if (!stopwords.contains(word)) {
                builder.append(word);
                builder.append(' ');
            }
        }
        return builder.toString().trim();
    }

    /**
     * Splits the text on single spaces into a mutable list and removes all
     * stopwords with {@code removeAll}.
     *
     * @return the remaining tokens joined by single spaces
     */
    @Benchmark
    public String removeAll() {
        ArrayList<String> allWords = Stream.of(data.split(" "))
            .collect(Collectors.toCollection(ArrayList<String>::new));
        allWords.removeAll(stopwords);
        return allWords.stream().collect(Collectors.joining(" "));
    }

    /**
     * Deletes every match of the stopword regex from the text.
     *
     * @return the text with all regex matches replaced by the empty string
     */
    @Benchmark
    public String replaceRegex() {
        return data.replaceAll(stopwordsRegex, "");
    }
}

View File

@ -0,0 +1,127 @@
i
me
my
myself
we
our
ours
ourselves
you
your
yours
yourself
yourselves
he
him
his
himself
she
her
hers
herself
it
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
should
now

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,60 @@
package com.baeldung.string;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Checks three ways of removing stopwords from a sentence: a manual loop,
 * {@code Collection.removeAll}, and a regular expression. Each test
 * lower-cases the sample sentence first and expects the same result.
 */
public class RemoveStopwordsUnitTest {

    final String original = "The quick brown fox jumps over the lazy dog";
    final String target = "quick brown fox jumps lazy dog";
    static List<String> stopwords;

    /**
     * Reads the stopword list (one word per line) once for all tests.
     *
     * @throws IOException if the stopword file cannot be read
     */
    @BeforeClass
    public static void loadStopwords() throws IOException {
        stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
    }

    /** Filters tokens in a loop and re-joins the ones that are not stopwords. */
    @Test
    public void whenRemoveStopwordsManually_thenSuccess() {
        // Locale.ROOT keeps the case mapping independent of the JVM default locale.
        String[] allWords = original.toLowerCase(java.util.Locale.ROOT)
            .split(" ");
        StringBuilder builder = new StringBuilder();
        for (String word : allWords) {
            if (!stopwords.contains(word)) {
                builder.append(word);
                builder.append(' ');
            }
        }
        String result = builder.toString().trim();
        // assertEquals takes (expected, actual); this order keeps failure messages accurate.
        assertEquals(target, result);
    }

    /** Collects tokens into a mutable list and removes stopwords with removeAll. */
    @Test
    public void whenRemoveStopwordsUsingRemoveAll_thenSuccess() {
        ArrayList<String> allWords = Stream.of(original.toLowerCase(java.util.Locale.ROOT)
            .split(" "))
            .collect(Collectors.toCollection(ArrayList<String>::new));
        allWords.removeAll(stopwords);
        String result = allWords.stream().collect(Collectors.joining(" "));
        assertEquals(target, result);
    }

    /** Deletes stopwords with a whole-word alternation regex. */
    @Test
    public void whenRemoveStopwordsUsingRegex_thenSuccess() {
        // Produces \b(w1|w2|...)\b\s? : a whole-word stopword plus at most one
        // trailing whitespace character.
        String stopwordsRegex = stopwords.stream()
            .collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
        String result = original.toLowerCase(java.util.Locale.ROOT).replaceAll(stopwordsRegex, "");
        assertEquals(target, result);
    }
}