remove stop words from string
This commit is contained in:
parent
8754806e7e
commit
a4762fcc90
|
@ -15,28 +15,6 @@
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
|
||||||
<groupId>commons-io</groupId>
|
|
||||||
<artifactId>commons-io</artifactId>
|
|
||||||
<version>${commons-io.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>log4j</groupId>
|
|
||||||
<artifactId>log4j</artifactId>
|
|
||||||
<version>${log4j.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>commons-codec</groupId>
|
|
||||||
<artifactId>commons-codec</artifactId>
|
|
||||||
<version>${commons-codec.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<!-- test scoped -->
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.assertj</groupId>
|
|
||||||
<artifactId>assertj-core</artifactId>
|
|
||||||
<version>${assertj.version}</version>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.openjdk.jmh</groupId>
|
<groupId>org.openjdk.jmh</groupId>
|
||||||
<artifactId>jmh-core</artifactId>
|
<artifactId>jmh-core</artifactId>
|
||||||
|
@ -57,11 +35,6 @@
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
<version>${guava.version}</version>
|
<version>${guava.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>com.vdurmont</groupId>
|
|
||||||
<artifactId>emoji-java</artifactId>
|
|
||||||
<version>${emoji-java.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
@ -73,38 +46,18 @@
|
||||||
<version>${junit.version}</version>
|
<version>${junit.version}</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>org.junit.jupiter</groupId>
|
|
||||||
<artifactId>junit-jupiter-api</artifactId>
|
|
||||||
<version>${junit-jupiter-api.version}</version>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.hamcrest</groupId>
|
<groupId>org.hamcrest</groupId>
|
||||||
<artifactId>hamcrest-library</artifactId>
|
<artifactId>hamcrest-library</artifactId>
|
||||||
<version>${org.hamcrest.version}</version>
|
<version>${org.hamcrest.version}</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- Added for password generation -->
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.passay</groupId>
|
|
||||||
<artifactId>passay</artifactId>
|
|
||||||
<version>${passay.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-text</artifactId>
|
<artifactId>commons-text</artifactId>
|
||||||
<version>${commons-text.version}</version>
|
<version>${commons-text.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.ahocorasick</groupId>
|
|
||||||
<artifactId>ahocorasick</artifactId>
|
|
||||||
<version>${ahocorasick.version}</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
@ -131,18 +84,10 @@
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
<!-- util -->
|
|
||||||
<commons-lang3.version>3.8.1</commons-lang3.version>
|
<commons-lang3.version>3.8.1</commons-lang3.version>
|
||||||
<commons-codec.version>1.10</commons-codec.version>
|
|
||||||
<!-- testing -->
|
|
||||||
<assertj.version>3.6.1</assertj.version>
|
|
||||||
<icu4j.version>61.1</icu4j.version>
|
<icu4j.version>61.1</icu4j.version>
|
||||||
<guava.version>27.0.1-jre</guava.version>
|
<guava.version>27.0.1-jre</guava.version>
|
||||||
<emoji-java.version>4.0.0</emoji-java.version>
|
|
||||||
<junit-jupiter-api.version>5.3.1</junit-jupiter-api.version>
|
|
||||||
<passay.version>1.3.1</passay.version>
|
|
||||||
<commons-text.version>1.4</commons-text.version>
|
<commons-text.version>1.4</commons-text.version>
|
||||||
<ahocorasick.version>0.4.0</ahocorasick.version>
|
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
</project>
|
</project>
|
|
@ -0,0 +1,73 @@
|
||||||
|
package com.baeldung.string.performance;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.openjdk.jmh.annotations.Benchmark;
|
||||||
|
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||||
|
import org.openjdk.jmh.annotations.Fork;
|
||||||
|
import org.openjdk.jmh.annotations.Mode;
|
||||||
|
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||||
|
import org.openjdk.jmh.annotations.Scope;
|
||||||
|
import org.openjdk.jmh.annotations.Setup;
|
||||||
|
import org.openjdk.jmh.annotations.State;
|
||||||
|
|
||||||
|
|
||||||
|
@Fork(value = 3, warmups = 1)
|
||||||
|
@State(Scope.Benchmark)
|
||||||
|
@BenchmarkMode(Mode.AverageTime)
|
||||||
|
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||||
|
public class RemovingStopwordsPerformanceComparison {
|
||||||
|
|
||||||
|
private String data;
|
||||||
|
|
||||||
|
private List<String> stopwords;
|
||||||
|
|
||||||
|
private String stopwordsRegex;
|
||||||
|
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
org.openjdk.jmh.Main.main(args);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Setup
|
||||||
|
public void setup() throws IOException {
|
||||||
|
data = new String(Files.readAllBytes(Paths.get("src/main/resources/shakespeare-hamlet.txt")));
|
||||||
|
data = data.toLowerCase();
|
||||||
|
stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
|
||||||
|
stopwordsRegex = stopwords.stream().collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public String removeManually() {
|
||||||
|
String[] allWords = data.split(" ");
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
for(String word:allWords) {
|
||||||
|
if(! stopwords.contains(word)) {
|
||||||
|
builder.append(word);
|
||||||
|
builder.append(' ');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return builder.toString().trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public String removeAll() {
|
||||||
|
ArrayList<String> allWords = Stream.of(data.split(" "))
|
||||||
|
.collect(Collectors.toCollection(ArrayList<String>::new));
|
||||||
|
allWords.removeAll(stopwords);
|
||||||
|
return allWords.stream().collect(Collectors.joining(" "));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public String replaceRegex() {
|
||||||
|
return data.replaceAll(stopwordsRegex, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,127 @@
|
||||||
|
i
|
||||||
|
me
|
||||||
|
my
|
||||||
|
myself
|
||||||
|
we
|
||||||
|
our
|
||||||
|
ours
|
||||||
|
ourselves
|
||||||
|
you
|
||||||
|
your
|
||||||
|
yours
|
||||||
|
yourself
|
||||||
|
yourselves
|
||||||
|
he
|
||||||
|
him
|
||||||
|
his
|
||||||
|
himself
|
||||||
|
she
|
||||||
|
her
|
||||||
|
hers
|
||||||
|
herself
|
||||||
|
it
|
||||||
|
its
|
||||||
|
itself
|
||||||
|
they
|
||||||
|
them
|
||||||
|
their
|
||||||
|
theirs
|
||||||
|
themselves
|
||||||
|
what
|
||||||
|
which
|
||||||
|
who
|
||||||
|
whom
|
||||||
|
this
|
||||||
|
that
|
||||||
|
these
|
||||||
|
those
|
||||||
|
am
|
||||||
|
is
|
||||||
|
are
|
||||||
|
was
|
||||||
|
were
|
||||||
|
be
|
||||||
|
been
|
||||||
|
being
|
||||||
|
have
|
||||||
|
has
|
||||||
|
had
|
||||||
|
having
|
||||||
|
do
|
||||||
|
does
|
||||||
|
did
|
||||||
|
doing
|
||||||
|
a
|
||||||
|
an
|
||||||
|
the
|
||||||
|
and
|
||||||
|
but
|
||||||
|
if
|
||||||
|
or
|
||||||
|
because
|
||||||
|
as
|
||||||
|
until
|
||||||
|
while
|
||||||
|
of
|
||||||
|
at
|
||||||
|
by
|
||||||
|
for
|
||||||
|
with
|
||||||
|
about
|
||||||
|
against
|
||||||
|
between
|
||||||
|
into
|
||||||
|
through
|
||||||
|
during
|
||||||
|
before
|
||||||
|
after
|
||||||
|
above
|
||||||
|
below
|
||||||
|
to
|
||||||
|
from
|
||||||
|
up
|
||||||
|
down
|
||||||
|
in
|
||||||
|
out
|
||||||
|
on
|
||||||
|
off
|
||||||
|
over
|
||||||
|
under
|
||||||
|
again
|
||||||
|
further
|
||||||
|
then
|
||||||
|
once
|
||||||
|
here
|
||||||
|
there
|
||||||
|
when
|
||||||
|
where
|
||||||
|
why
|
||||||
|
how
|
||||||
|
all
|
||||||
|
any
|
||||||
|
both
|
||||||
|
each
|
||||||
|
few
|
||||||
|
more
|
||||||
|
most
|
||||||
|
other
|
||||||
|
some
|
||||||
|
such
|
||||||
|
no
|
||||||
|
nor
|
||||||
|
not
|
||||||
|
only
|
||||||
|
own
|
||||||
|
same
|
||||||
|
so
|
||||||
|
than
|
||||||
|
too
|
||||||
|
very
|
||||||
|
s
|
||||||
|
t
|
||||||
|
can
|
||||||
|
will
|
||||||
|
just
|
||||||
|
don
|
||||||
|
should
|
||||||
|
now
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,60 @@
|
||||||
|
package com.baeldung.string;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class RemoveStopwordsUnitTest {
|
||||||
|
final String original = "The quick brown fox jumps over the lazy dog";
|
||||||
|
final String target = "quick brown fox jumps lazy dog";
|
||||||
|
static List<String> stopwords;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void loadStopwords() throws IOException {
|
||||||
|
stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenRemoveStopwordsManually_thenSuccess() {
|
||||||
|
String[] allWords = original.toLowerCase()
|
||||||
|
.split(" ");
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
for (String word : allWords) {
|
||||||
|
if (!stopwords.contains(word)) {
|
||||||
|
builder.append(word);
|
||||||
|
builder.append(' ');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String result = builder.toString().trim();
|
||||||
|
assertEquals(result, target);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenRemoveStopwordsUsingRemoveAll_thenSuccess() {
|
||||||
|
ArrayList<String> allWords = Stream.of(original.toLowerCase()
|
||||||
|
.split(" "))
|
||||||
|
.collect(Collectors.toCollection(ArrayList<String>::new));
|
||||||
|
allWords.removeAll(stopwords);
|
||||||
|
String result = allWords.stream().collect(Collectors.joining(" "));
|
||||||
|
assertEquals(result, target);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whenRemoveStopwordsUsingRegex_thenSuccess() {
|
||||||
|
String stopwordsRegex = stopwords.stream()
|
||||||
|
.collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
|
||||||
|
String result = original.toLowerCase().replaceAll(stopwordsRegex, "");
|
||||||
|
assertEquals(result, target);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue