Introduction to OpenNLP (#2024)

* Introduction to OpenNLP

* Introduction to OpenNLP
This commit is contained in:
Jesus Boadas 2017-06-08 16:54:35 -04:00 committed by maibin
parent 40dc547558
commit b00ec6abfd
11 changed files with 343 additions and 1 deletions

View File

@ -0,0 +1 @@
Out of the night that covers me

View File

@ -0,0 +1,10 @@
GOOD good morning /
GOOD good evening /
GOOD have a good day /
GOOD nice party! /
GOOD fine pants /
BAD nightmare volcano in the sea /
BAD darkest sky /
BAD greed and waste /
BAD army attacks /
BAD bomb explodes /

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -323,6 +323,13 @@
<artifactId>netty-all</artifactId> <artifactId>netty-all</artifactId>
<version>${netty.version}</version> <version>${netty.version}</version>
</dependency> </dependency>
<!-- OpenNLP -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.8.0</version>
</dependency>
</dependencies> </dependencies>
<properties> <properties>
<multiverse.version>0.7.0</multiverse.version> <multiverse.version>0.7.0</multiverse.version>
@ -350,4 +357,4 @@
<netty.version>4.1.10.Final</netty.version> <netty.version>4.1.10.Final</netty.version>
</properties> </properties>
</project> </project>

View File

@ -0,0 +1,166 @@
package com.baeldung.opennlp;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.postag.POSModelLoader;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
public class OpenNLP {
private final static Logger LOGGER = Logger.getLogger(OpenNLP.class.getName());
private final static String text = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south.";
private final static String sentence[] = new String[] { "James", "Jordan", "live", "in", "Oklahoma", "city", "." };
private DoccatModel docCatModel;
public static void main(String[] args) {
new OpenNLP();
}
public OpenNLP() {
try {
sentenceDetector();
tokenizer();
nameFinder();
locationFinder();
trainDocumentCategorizer();
documentCategorizer();
partOfSpeechTagger();
chunker();
} catch (InvalidFormatException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
public void sentenceDetector() throws InvalidFormatException, IOException {
InputStream is = new FileInputStream("OpenNLP/en-sent.bin");
SentenceModel model = new SentenceModel(is);
SentenceDetectorME sdetector = new SentenceDetectorME(model);
String sentences[] = sdetector.sentDetect(text);
Arrays.stream(sentences).forEach(LOGGER::info);
is.close();
}
public void tokenizer() throws InvalidFormatException, IOException {
InputStream is = new FileInputStream("OpenNLP/en-token.bin");
TokenizerModel model = new TokenizerModel(is);
Tokenizer tokenizer = new TokenizerME(model);
String tokens[] = tokenizer.tokenize(text);
Arrays.stream(tokens).forEach(LOGGER::info);
is.close();
}
public static void nameFinder() throws IOException {
InputStream is = new FileInputStream("OpenNLP/en-ner-person.bin");
TokenNameFinderModel model = new TokenNameFinderModel(is);
is.close();
NameFinderME nameFinder = new NameFinderME(model);
Span nameSpans[] = nameFinder.find(sentence);
String[] names = Span.spansToStrings(nameSpans, sentence);
Arrays.stream(names).forEach(LOGGER::info);
}
public static void locationFinder() throws IOException {
InputStream is = new FileInputStream("OpenNLP/en-ner-location.bin");
TokenNameFinderModel model = new TokenNameFinderModel(is);
is.close();
NameFinderME nameFinder = new NameFinderME(model);
Span locationSpans[] = nameFinder.find(sentence);
String[] locations = Span.spansToStrings(locationSpans, sentence);
Arrays.stream(locations).forEach(LOGGER::info);
}
public void trainDocumentCategorizer() {
try {
InputStreamFactory isf = new InputStreamFactory() {
public InputStream createInputStream() throws IOException {
return new FileInputStream("OpenNLP/doc-cat.train");
}
};
ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
DoccatFactory docCatFactory = new DoccatFactory();
docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory);
} catch (IOException e) {
e.printStackTrace();
}
}
public void documentCategorizer() {
DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel);
double[] outcomes = myCategorizer.categorize(sentence);
String category = myCategorizer.getBestCategory(outcomes);
if (category.equalsIgnoreCase("GOOD")) {
LOGGER.info("Document is positive :) ");
} else {
LOGGER.info("Document is negative :( ");
}
}
public static void partOfSpeechTagger() throws IOException {
try {
POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin"));
POSTaggerME posTaggerME = new POSTaggerME(posModel);
InputStreamFactory isf = new InputStreamFactory() {
public InputStream createInputStream() throws IOException {
return new FileInputStream("OpenNLP/PartOfSpeechTag.txt");
}
};
ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
String line;
while ((line = lineStream.read()) != null) {
String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
String[] tags = posTaggerME.tag(whitespaceTokenizerLine);
POSSample posSample = new POSSample(whitespaceTokenizerLine, tags);
LOGGER.info(posSample.toString());
}
lineStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void chunker() throws IOException {
InputStream is = new FileInputStream("OpenNLP/en-chunker.bin");
ChunkerModel cModel = new ChunkerModel(is);
ChunkerME chunkerME = new ChunkerME(cModel);
String[] taggedSentence = new String[] {"Out", "of", "the", "night", "that", "covers", "me"};
String pos[] = new String[] { "IN", "IN", "DT", "NN", "WDT", "VBZ", "PRP"};
String chunks[] = chunkerME.chunk(taggedSentence, pos);
Arrays.stream(chunks).forEach(LOGGER::info);
}
}

View File

@ -0,0 +1,158 @@
package com.baeldung.opennlp;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.junit.Test;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.postag.POSModelLoader;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.DocumentSampleStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
/**
 * Tests for the OpenNLP demonstration, run against the model and data files
 * in the {@code OpenNLP/} directory (resolved relative to the working
 * directory).
 *
 * <p>Every test declares {@code throws IOException} so that a missing or
 * unreadable model file fails the test instead of letting it pass without
 * running any assertion.
 */
public class OpenNLPTests {

    /** Free text used by the sentence-detection test. */
    private static final String TEXT = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south.";

    /** Pre-tokenized sentence used by the name-finder, categorizer and chunker tests. */
    private static final String[] SENTENCE = { "James", "Jordan", "live", "in", "Oklahoma", "city", "." };

    /** The sample text is expected to be split into four sentences. */
    @Test
    public void givenText_WhenDetectSentences_ThenCountSentences() throws IOException {
        SentenceModel model;
        // try-with-resources closes the stream even when model loading fails.
        try (InputStream modelIn = new FileInputStream("OpenNLP/en-sent.bin")) {
            model = new SentenceModel(modelIn);
        }
        SentenceDetectorME detector = new SentenceDetectorME(model);
        String[] sentences = detector.sentDetect(TEXT);
        assertEquals(4, sentences.length);
    }

    /** The person-name model is expected to find exactly "James Jordan". */
    @Test
    public void givenText_WhenDetectTokens_ThenVerifyNames() throws IOException {
        TokenNameFinderModel model;
        try (InputStream modelIn = new FileInputStream("OpenNLP/en-ner-person.bin")) {
            model = new TokenNameFinderModel(modelIn);
        }
        NameFinderME nameFinder = new NameFinderME(model);
        Span[] nameSpans = nameFinder.find(SENTENCE);
        String[] names = Span.spansToStrings(nameSpans, SENTENCE);
        assertEquals(1, names.length);
        assertEquals("James Jordan", names[0]);
    }

    /** The location model is expected to find exactly "Oklahoma". */
    @Test
    public void givenText_WhenDetectTokens_ThenVerifyLocations() throws IOException {
        TokenNameFinderModel model;
        try (InputStream modelIn = new FileInputStream("OpenNLP/en-ner-location.bin")) {
            model = new TokenNameFinderModel(modelIn);
        }
        NameFinderME nameFinder = new NameFinderME(model);
        Span[] locationSpans = nameFinder.find(SENTENCE);
        String[] locations = Span.spansToStrings(locationSpans, SENTENCE);
        assertEquals(1, locations.length);
        assertEquals("Oklahoma", locations[0]);
    }

    /**
     * Trains a categorizer from {@code OpenNLP/doc-cat.train} and expects the
     * sample sentence to be classified as {@code GOOD}.
     */
    @Test
    public void givenText_WhenCategorizeDocument_ThenVerifyDocumentContent() throws IOException {
        InputStreamFactory isf = () -> new FileInputStream("OpenNLP/doc-cat.train");
        DoccatModel docCatModel;
        // Both streams are closed on every path, including a failed training run.
        try (ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
             ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream)) {
            DoccatFactory docCatFactory = new DoccatFactory();
            docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory);
        }
        DocumentCategorizerME categorizer = new DocumentCategorizerME(docCatModel);
        double[] outcomes = categorizer.categorize(SENTENCE);
        String category = categorizer.getBestCategory(outcomes);
        assertEquals("GOOD", category);
    }

    /**
     * Tags each line of {@code OpenNLP/PartOfSpeechTag.txt} and expects every
     * line to produce the same tagged string.
     */
    @Test
    public void givenText_WhenTagDocument_ThenVerifyTaggedString() throws IOException {
        POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin"));
        POSTaggerME posTagger = new POSTaggerME(posModel);
        InputStreamFactory isf = () -> new FileInputStream("OpenNLP/PartOfSpeechTag.txt");
        try (ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8")) {
            String line;
            while ((line = lineStream.read()) != null) {
                String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line);
                String[] tags = posTagger.tag(tokens);
                POSSample posSample = new POSSample(tokens, tags);
                assertEquals("Out_IN of_IN the_DT night_NN that_WDT covers_VBZ me_PRP", posSample.toString());
            }
        }
    }

    /** The chunker is expected to return one chunk tag per input token. */
    @Test
    public void givenText_WhenChunked_ThenCountChunks() throws IOException {
        ChunkerModel model;
        try (InputStream modelIn = new FileInputStream("OpenNLP/en-chunker.bin")) {
            model = new ChunkerModel(modelIn);
        }
        ChunkerME chunkerME = new ChunkerME(model);
        String[] pos = { "NNP", "NNP", "NNP", "POS", "NNP", "NN", "VBD" };
        String[] chunks = chunkerME.chunk(SENTENCE, pos);
        assertEquals(7, chunks.length);
    }
}