Merge pull request #3909 from mansi2392/master

BAEL-1557 Intro to Apache OpenNlp
This commit is contained in:
Tom Hombergs 2018-04-05 23:26:48 +02:00 committed by GitHub
commit b20a3a1097
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 301765 additions and 0 deletions

32
apache-opennlp/pom.xml Normal file
View File

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<artifactId>apache-opennlp</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.8.4</version>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>3.9.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,32 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class ChunkerTest {
@Test
public void givenChunkerModel_whenChunk_thenChunksAreDetected() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
POSModel posModel = new POSModel(inputStreamPOSTagger);
POSTaggerME posTagger = new POSTaggerME(posModel);
String tags[] = posTagger.tag(tokens);
InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin");
ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
ChunkerME chunker = new ChunkerME(chunkerModel);
String[] chunks = chunker.chunk(tokens, tags);
assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
}
}

View File

@ -0,0 +1,41 @@
package com.baeldung.apache.opennlp;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetector;
import opennlp.tools.langdetect.LanguageDetectorFactory;
import opennlp.tools.langdetect.LanguageDetectorME;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.langdetect.LanguageDetectorSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class LanguageDetectorAndTrainingDataTest {
@Test
public void givenLanguageDictionary_whenLanguageDetect_thenLanguageIsDetected() throws FileNotFoundException, IOException {
InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));
ObjectStream lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, 100);
params.put(TrainingParameters.CUTOFF_PARAM, 5);
params.put("DataIndexer", "TwoPass");
params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
LanguageDetector ld = new LanguageDetectorME(model);
Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
assertThat(Arrays.asList(languages).toString()).contains("pob (0.9999999950605625)", "ita (4.939427661577956E-9)", "spa (9.665954064665144E-15)",
"fra (8.250349924885834E-25)");
}
}

View File

@ -0,0 +1,29 @@
package com.baeldung.apache.opennlp;
import java.io.InputStream;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class LemmetizerTest {
@Test
public void givenEnglishDictionary_whenLemmatize_thenLemmasAreDetected() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
POSModel posModel = new POSModel(inputStreamPOSTagger);
POSTaggerME posTagger = new POSTaggerME(posModel);
String tags[] = posTagger.tag(tokens);
InputStream dictLemmatizer = getClass().getResourceAsStream("/models/en-lemmatizer.dict");
DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
String[] lemmas = lemmatizer.lemmatize(tokens, tags);
assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
}
}

View File

@ -0,0 +1,39 @@
package com.baeldung.apache.opennlp;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class NamedEntityRecognitionTest {
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
NameFinderME nameFinderME = new NameFinderME(model);
List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
List<String> names = new ArrayList<String>();
int k = 0;
for (Span s : spans) {
names.add("");
for (int index = s.getStart(); index < s.getEnd(); index++) {
names.set(k, names.get(k) + tokens[index]);
}
k++;
}
assertThat(names).contains("John","Leonard","Penny");
}
}

View File

@ -0,0 +1,24 @@
package com.baeldung.apache.opennlp;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class POSTaggerTest {
@Test
public void givenPOSModel_whenPOSTagging_thenPOSAreDetected() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
POSModel posModel = new POSModel(inputStreamPOSTagger);
POSTaggerME posTagger = new POSTaggerME(posModel);
String tags[] = posTagger.tag(tokens);
assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
}
}

View File

@ -0,0 +1,28 @@
package com.baeldung.apache.opennlp;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class SentenceDetectionTest {
@Test
public void givenEnglishModel_whenDetect_thenSentencesAreDetected() throws Exception {
String paragraph = "This is a statement. This is another statement. Now is an abstract word for time, "
+ "that is always flying. And my email address is google@gmail.com.";
InputStream is = getClass().getResourceAsStream("/models/en-sent.bin");
SentenceModel model = new SentenceModel(is);
SentenceDetectorME sdetector = new SentenceDetectorME(model);
String sentences[] = sdetector.sentDetect(paragraph);
assertThat(sentences).contains("This is a statement.",
"This is another statement.",
"Now is an abstract word for time, that is always flying.",
"And my email address is google@gmail.com.");
}
}

View File

@ -0,0 +1,36 @@
package com.baeldung.apache.opennlp;
import java.io.InputStream;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class TokenizerTest {
@Test
public void givenEnglishModel_whenTokenize_thenTokensAreDetected() throws Exception {
InputStream inputStream = getClass().getResourceAsStream("/models/en-token.bin");
TokenizerModel model = new TokenizerModel(inputStream);
TokenizerME tokenizer = new TokenizerME(model);
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}
@Test
public void givenWhitespaceTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
}
@Test
public void givenSimpleTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}
}

View File

@ -42,6 +42,7 @@
<module>apache-thrift</module>
<module>apache-curator</module>
<module>apache-zookeeper</module>
<module>apache-opennlp</module>
<module>autovalue</module>
<module>axon</module>
<module>bootique</module>