remove stop words from string
This commit is contained in:
parent
8754806e7e
commit
a4762fcc90
|
@ -15,28 +15,6 @@
|
|||
</parent>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<version>${commons-io.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
<version>${log4j.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-codec</groupId>
|
||||
<artifactId>commons-codec</artifactId>
|
||||
<version>${commons-codec.version}</version>
|
||||
</dependency>
|
||||
<!-- test scoped -->
|
||||
<dependency>
|
||||
<groupId>org.assertj</groupId>
|
||||
<artifactId>assertj-core</artifactId>
|
||||
<version>${assertj.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.openjdk.jmh</groupId>
|
||||
<artifactId>jmh-core</artifactId>
|
||||
|
@ -57,11 +35,6 @@
|
|||
<artifactId>guava</artifactId>
|
||||
<version>${guava.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.vdurmont</groupId>
|
||||
<artifactId>emoji-java</artifactId>
|
||||
<version>${emoji-java.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
|
@ -73,38 +46,18 @@
|
|||
<version>${junit.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<version>${junit-jupiter-api.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.hamcrest</groupId>
|
||||
<artifactId>hamcrest-library</artifactId>
|
||||
<version>${org.hamcrest.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Added for password generation -->
|
||||
<dependency>
|
||||
<groupId>org.passay</groupId>
|
||||
<artifactId>passay</artifactId>
|
||||
<version>${passay.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-text</artifactId>
|
||||
<version>${commons-text.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.ahocorasick</groupId>
|
||||
<artifactId>ahocorasick</artifactId>
|
||||
<version>${ahocorasick.version}</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
@ -131,18 +84,10 @@
|
|||
</build>
|
||||
|
||||
<properties>
|
||||
<!-- util -->
|
||||
<commons-lang3.version>3.8.1</commons-lang3.version>
|
||||
<commons-codec.version>1.10</commons-codec.version>
|
||||
<!-- testing -->
|
||||
<assertj.version>3.6.1</assertj.version>
|
||||
<icu4j.version>61.1</icu4j.version>
|
||||
<guava.version>27.0.1-jre</guava.version>
|
||||
<emoji-java.version>4.0.0</emoji-java.version>
|
||||
<junit-jupiter-api.version>5.3.1</junit-jupiter-api.version>
|
||||
<passay.version>1.3.1</passay.version>
|
||||
<commons-text.version>1.4</commons-text.version>
|
||||
<ahocorasick.version>0.4.0</ahocorasick.version>
|
||||
</properties>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,73 @@
|
|||
package com.baeldung.string.performance;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
|
||||
|
||||
@Fork(value = 3, warmups = 1)
|
||||
@State(Scope.Benchmark)
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
public class RemovingStopwordsPerformanceComparison {
|
||||
|
||||
private String data;
|
||||
|
||||
private List<String> stopwords;
|
||||
|
||||
private String stopwordsRegex;
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
org.openjdk.jmh.Main.main(args);
|
||||
}
|
||||
|
||||
@Setup
|
||||
public void setup() throws IOException {
|
||||
data = new String(Files.readAllBytes(Paths.get("src/main/resources/shakespeare-hamlet.txt")));
|
||||
data = data.toLowerCase();
|
||||
stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
|
||||
stopwordsRegex = stopwords.stream().collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public String removeManually() {
|
||||
String[] allWords = data.split(" ");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for(String word:allWords) {
|
||||
if(! stopwords.contains(word)) {
|
||||
builder.append(word);
|
||||
builder.append(' ');
|
||||
}
|
||||
}
|
||||
return builder.toString().trim();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public String removeAll() {
|
||||
ArrayList<String> allWords = Stream.of(data.split(" "))
|
||||
.collect(Collectors.toCollection(ArrayList<String>::new));
|
||||
allWords.removeAll(stopwords);
|
||||
return allWords.stream().collect(Collectors.joining(" "));
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public String replaceRegex() {
|
||||
return data.replaceAll(stopwordsRegex, "");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,127 @@
|
|||
i
|
||||
me
|
||||
my
|
||||
myself
|
||||
we
|
||||
our
|
||||
ours
|
||||
ourselves
|
||||
you
|
||||
your
|
||||
yours
|
||||
yourself
|
||||
yourselves
|
||||
he
|
||||
him
|
||||
his
|
||||
himself
|
||||
she
|
||||
her
|
||||
hers
|
||||
herself
|
||||
it
|
||||
its
|
||||
itself
|
||||
they
|
||||
them
|
||||
their
|
||||
theirs
|
||||
themselves
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
am
|
||||
is
|
||||
are
|
||||
was
|
||||
were
|
||||
be
|
||||
been
|
||||
being
|
||||
have
|
||||
has
|
||||
had
|
||||
having
|
||||
do
|
||||
does
|
||||
did
|
||||
doing
|
||||
a
|
||||
an
|
||||
the
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
s
|
||||
t
|
||||
can
|
||||
will
|
||||
just
|
||||
don
|
||||
should
|
||||
now
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,60 @@
|
|||
package com.baeldung.string;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
public class RemoveStopwordsUnitTest {
|
||||
final String original = "The quick brown fox jumps over the lazy dog";
|
||||
final String target = "quick brown fox jumps lazy dog";
|
||||
static List<String> stopwords;
|
||||
|
||||
@BeforeClass
|
||||
public static void loadStopwords() throws IOException {
|
||||
stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenRemoveStopwordsManually_thenSuccess() {
|
||||
String[] allWords = original.toLowerCase()
|
||||
.split(" ");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (String word : allWords) {
|
||||
if (!stopwords.contains(word)) {
|
||||
builder.append(word);
|
||||
builder.append(' ');
|
||||
}
|
||||
}
|
||||
|
||||
String result = builder.toString().trim();
|
||||
assertEquals(result, target);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenRemoveStopwordsUsingRemoveAll_thenSuccess() {
|
||||
ArrayList<String> allWords = Stream.of(original.toLowerCase()
|
||||
.split(" "))
|
||||
.collect(Collectors.toCollection(ArrayList<String>::new));
|
||||
allWords.removeAll(stopwords);
|
||||
String result = allWords.stream().collect(Collectors.joining(" "));
|
||||
assertEquals(result, target);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenRemoveStopwordsUsingRegex_thenSuccess() {
|
||||
String stopwordsRegex = stopwords.stream()
|
||||
.collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
|
||||
String result = original.toLowerCase().replaceAll(stopwordsRegex, "");
|
||||
assertEquals(result, target);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue