BAEL-1557 Intro to Apache OpenNlp

This commit is contained in:
mansi2392 2018-03-30 11:28:42 +05:30
parent 764ccb54a9
commit db10f66e60
15 changed files with 301773 additions and 0 deletions

32
apache-opennlp/pom.xml Normal file
View File

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the apache-opennlp module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits shared build configuration from the parent-modules POM. -->
<parent>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>
<artifactId>apache-opennlp</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<!-- OpenNLP toolkit; no scope is given, so Maven's default (compile) scope applies. -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.8.4</version>
</dependency>
<!-- AssertJ fluent assertions, available to test code only. -->
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<version>3.9.0</version>
<scope>test</scope>
</dependency>
<!-- JUnit 4 test runner, available to test code only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,36 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
 * Demonstrates OpenNLP chunking: a sentence is tokenized, each token is
 * POS-tagged, and the tokens plus tags are handed to a chunker.
 *
 * @author Parth
 */
public class ChunkerTest {

    /**
     * Tokenizes a sentence, tags it, chunks it, and checks that the expected
     * chunk labels are present in the result.
     *
     * @throws Exception if a model file cannot be opened or parsed
     */
    @Test
    public void givenSentence_whenChunk_thenGetChunks() throws Exception {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");

        // Each model stream is closed as soon as the model object has been built from it.
        POSModel posModel;
        try (InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin")) {
            posModel = new POSModel(inputStreamPOSTagger);
        }
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);

        ChunkerModel chunkerModel;
        try (InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin")) {
            chunkerModel = new ChunkerModel(inputStreamChunker);
        }
        ChunkerME chunker = new ChunkerME(chunkerModel);

        // The chunker takes the tokens together with their POS tags.
        String[] chunks = chunker.chunk(tokens, tags);

        assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
    }
}

View File

@ -0,0 +1,41 @@
package com.baeldung.apache.opennlp;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetector;
import opennlp.tools.langdetect.LanguageDetectorFactory;
import opennlp.tools.langdetect.LanguageDetectorME;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.langdetect.LanguageDetectorSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
 * Trains an OpenNLP language detector from a sample file and uses the
 * resulting model to predict the language of a short phrase.
 */
public class LanguageDetectorAndTrainingData {

    /**
     * Trains a model from {@code DoccatSample.txt}, predicts languages for a
     * phrase, and checks the rendered predictions.
     *
     * @throws FileNotFoundException if the training file does not exist
     * @throws IOException if reading the training data or closing the streams fails
     */
    @Test
    public void test() throws FileNotFoundException, IOException {
        InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));

        TrainingParameters params = new TrainingParameters();
        params.put(TrainingParameters.ITERATIONS_PARAM, 100);
        params.put(TrainingParameters.CUTOFF_PARAM, 5);
        params.put("DataIndexer", "TwoPass");
        params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");

        // The training streams hold the sample file open; close them once training is done.
        LanguageDetectorModel model;
        try (ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
             LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream)) {
            model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
        }

        LanguageDetector ld = new LanguageDetectorME(model);
        Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");

        // NOTE(review): this compares exact confidence values rendered as text,
        // so it is tied to this training file and these parameters.
        assertThat(Arrays.asList(languages).toString()).contains("pob (0.9999999950605625)", "ita (4.939427661577956E-9)", "spa (9.665954064665144E-15)",
            "fra (8.250349924885834E-25)");
    }
}

View File

@ -0,0 +1,31 @@
package com.baeldung.apache.opennlp;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
 * Demonstrates dictionary-based lemmatization with OpenNLP: tokens are
 * POS-tagged and then looked up in a lemmatizer dictionary.
 */
public class LemmetizerTest {

    /**
     * Tokenizes and tags a sentence, lemmatizes it, and checks the lemmas.
     *
     * @throws Exception if the POS model or the dictionary cannot be opened or parsed
     */
    @Test
    public void givenSentence_whenLemmetize_thenGetLemmas() throws Exception {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

        // Close each input stream as soon as the object built from it exists.
        POSModel posModel;
        try (InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin")) {
            posModel = new POSModel(inputStreamPOSTagger);
        }
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);

        DictionaryLemmatizer lemmatizer;
        try (InputStream dictLemmatizer = new FileInputStream("src/main/resources/models/en-lemmatizer.dict")) {
            lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
        }

        // The lemmatizer takes the tokens together with their POS tags.
        String[] lemmas = lemmatizer.lemmatize(tokens, tags);

        assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
    }
}

View File

@ -0,0 +1,40 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
 * Demonstrates person-name recognition with OpenNLP's {@code NameFinderME}.
 */
public class NamedEntityRecognitionTest {

    /**
     * Finds person-name spans in a tokenized text and checks both the spans
     * and the names rebuilt from them.
     *
     * @throws Exception if the name-finder model cannot be opened or parsed
     */
    @Test
    public void givenTextWithPersonNames_whenNER_thenGetPersonNamesList() throws Exception {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");

        // Close the model stream as soon as the model has been built from it.
        TokenNameFinderModel model;
        try (InputStream inputStreamNameFinder = new FileInputStream("src/main/resources/models/en-ner-person.bin")) {
            model = new TokenNameFinderModel(inputStreamNameFinder);
        }
        NameFinderME nameFinderME = new NameFinderME(model);

        List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
        assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");

        // Rebuild each name by concatenating the tokens in [start, end) of its span.
        List<String> names = new ArrayList<String>();
        for (Span s : spans) {
            StringBuilder name = new StringBuilder();
            for (int index = s.getStart(); index < s.getEnd(); index++) {
                name.append(tokens[index]);
            }
            names.add(name.toString());
        }

        assertThat(names).contains("John","Leonard","Penny");
    }
}

View File

@ -0,0 +1,25 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
 * Demonstrates part-of-speech tagging with OpenNLP's {@code POSTaggerME}.
 */
public class POSTaggerTest {

    /**
     * Tokenizes a sentence, tags each token, and checks the tags.
     *
     * @throws Exception if the POS model cannot be opened or parsed
     */
    @Test
    public void givenSentence_whenPOSTagging_thenGetTags() throws Exception {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

        // Close the model stream as soon as the model has been built from it.
        POSModel posModel;
        try (InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin")) {
            posModel = new POSModel(inputStreamPOSTagger);
        }
        POSTaggerME posTagger = new POSTaggerME(posModel);

        String[] tags = posTagger.tag(tokens);

        assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
    }
}

View File

@ -0,0 +1,29 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
 * Demonstrates sentence boundary detection with OpenNLP's {@code SentenceDetectorME}.
 */
public class SentenceDetectionTest {

    /**
     * Splits a paragraph into sentences and checks the detected sentences.
     *
     * @throws Exception if the sentence model cannot be opened or parsed
     */
    @Test
    public void givenText_whenDetectSent_thenGetSentences() throws Exception {
        String paragraph = "This is a statement. This is another statement. Now is an abstract word for time, "
            + "that is always flying. And my email address is google@gmail.com.";

        // Close the model stream as soon as the model has been built from it.
        SentenceModel model;
        try (InputStream is = new FileInputStream("src/main/resources/models/en-sent.bin")) {
            model = new SentenceModel(is);
        }
        SentenceDetectorME sdetector = new SentenceDetectorME(model);

        String[] sentences = sdetector.sentDetect(paragraph);

        assertThat(sentences).contains("This is a statement.",
            "This is another statement.",
            "Now is an abstract word for time, that is always flying.",
            "And my email address is google@gmail.com.");
    }
}

View File

@ -0,0 +1,36 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
 * Demonstrates three OpenNLP tokenizers on the same sentence: the
 * model-based {@code TokenizerME}, {@code WhitespaceTokenizer} and
 * {@code SimpleTokenizer}.
 */
public class TokenizerTest {

    /**
     * Tokenizes with a tokenizer built from a model file.
     *
     * @throws Exception if the tokenizer model cannot be opened or parsed
     */
    @Test
    public void givenString_whenTokenize_thenGetTokens() throws Exception {
        // Close the model stream as soon as the model has been built from it.
        TokenizerModel model;
        try (FileInputStream fileInputStream = new FileInputStream("src/main/resources/models/en-token.bin")) {
            model = new TokenizerModel(fileInputStream);
        }
        TokenizerME tokenizer = new TokenizerME(model);

        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");

        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
    }

    /**
     * Tokenizes with {@code WhitespaceTokenizer}; the expected tokens keep the
     * final period attached to the last word.
     */
    @Test
    public void givenString_whenWhitespaceTokenizer_thenGetTokens() throws Exception {
        WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;

        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");

        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
    }

    /**
     * Tokenizes with {@code SimpleTokenizer}; the expected tokens have the
     * final period as a separate token.
     */
    @Test
    public void givenString_whenSimpleTokenizer_thenGetTokens() throws Exception {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;

        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");

        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
    }
}