Introduction to OpenNLP (#2024)

* Introduction to OpenNLP * Introduction to OpenNLP
2017-06-08 16:54:35 -04:00 · 2017-06-08 16:54:35 -04:00 · b00ec6abfd
commit b00ec6abfd
parent 40dc547558
11 changed files with 343 additions and 1 deletions
--- a/libraries/OpenNLP/PartOfSpeechTag.txt
+++ b/libraries/OpenNLP/PartOfSpeechTag.txt
@ -0,0 +1 @@
 Out of the night that covers me
--- a/libraries/OpenNLP/doc-cat.train
+++ b/libraries/OpenNLP/doc-cat.train
@ -0,0 +1,10 @@
 GOOD good morning /
 GOOD good evening /
 GOOD have a good day /
 GOOD nice party! /
 GOOD fine pants /
 BAD nightmare volcano in the sea /
 BAD darkest sky /
 BAD greed and waste /
 BAD army attacks /
 BAD bomb explodes /
--- a/libraries/OpenNLP/en-chunker.bin
+++ b/libraries/OpenNLP/en-chunker.bin
--- a/libraries/OpenNLP/en-ner-location.bin
+++ b/libraries/OpenNLP/en-ner-location.bin
--- a/libraries/OpenNLP/en-ner-person.bin
+++ b/libraries/OpenNLP/en-ner-person.bin
--- a/libraries/OpenNLP/en-pos-maxent.bin
+++ b/libraries/OpenNLP/en-pos-maxent.bin
--- a/libraries/OpenNLP/en-sent.bin
+++ b/libraries/OpenNLP/en-sent.bin
--- a/libraries/OpenNLP/en-token.bin
+++ b/libraries/OpenNLP/en-token.bin
--- a/libraries/pom.xml
+++ b/libraries/pom.xml
@ -323,6 +323,13 @@
            <artifactId>netty-all</artifactId>
            <version>${netty.version}</version>
        </dependency>
        <!-- OpenNLP -->
        <dependency>
            <groupId>org.apache.opennlp</groupId>
            <artifactId>opennlp-tools</artifactId>
            <version>1.8.0</version>
        </dependency>
    </dependencies>
    <properties>
        <multiverse.version>0.7.0</multiverse.version>
@ -350,4 +357,4 @@
        <netty.version>4.1.10.Final</netty.version>
    </properties>
-</project>
+</project>
--- a/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java
+++ b/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java
@ -0,0 +1,166 @@
 package com.baeldung.opennlp;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.logging.Logger;
 import opennlp.tools.chunker.ChunkerME;
 import opennlp.tools.chunker.ChunkerModel;
 import opennlp.tools.cmdline.postag.POSModelLoader;
 import opennlp.tools.doccat.DoccatFactory;
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.doccat.DocumentSampleStream;
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.postag.POSModel;
 import opennlp.tools.postag.POSSample;
 import opennlp.tools.postag.POSTaggerME;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.tokenize.TokenizerME;
 import opennlp.tools.tokenize.TokenizerModel;
 import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.InputStreamFactory;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
 public class OpenNLP {
    private final static Logger LOGGER = Logger.getLogger(OpenNLP.class.getName());
    private final static String text = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south.";
    private final static String sentence[] = new String[] { "James", "Jordan", "live", "in", "Oklahoma", "city", "." };
    private DoccatModel docCatModel;
    public static void main(String[] args) {
        new OpenNLP();
    }
    public OpenNLP() {
        try {
            sentenceDetector();
            tokenizer();
            nameFinder();
            locationFinder();
            trainDocumentCategorizer();
            documentCategorizer();
            partOfSpeechTagger();
            chunker();
        } catch (InvalidFormatException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public void sentenceDetector() throws InvalidFormatException, IOException {
        InputStream is = new FileInputStream("OpenNLP/en-sent.bin");
        SentenceModel model = new SentenceModel(is);
        SentenceDetectorME sdetector = new SentenceDetectorME(model);
        String sentences[] = sdetector.sentDetect(text);
        Arrays.stream(sentences).forEach(LOGGER::info);
        is.close();
    }
    public void tokenizer() throws InvalidFormatException, IOException {
        InputStream is = new FileInputStream("OpenNLP/en-token.bin");
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);
        String tokens[] = tokenizer.tokenize(text);
        Arrays.stream(tokens).forEach(LOGGER::info);
        is.close();
    }
    public static void nameFinder() throws IOException {
        InputStream is = new FileInputStream("OpenNLP/en-ner-person.bin");
        TokenNameFinderModel model = new TokenNameFinderModel(is);
        is.close();
        NameFinderME nameFinder = new NameFinderME(model);
        Span nameSpans[] = nameFinder.find(sentence);
        String[] names = Span.spansToStrings(nameSpans, sentence);
        Arrays.stream(names).forEach(LOGGER::info);
    }
    public static void locationFinder() throws IOException {
        InputStream is = new FileInputStream("OpenNLP/en-ner-location.bin");
        TokenNameFinderModel model = new TokenNameFinderModel(is);
        is.close();
        NameFinderME nameFinder = new NameFinderME(model);
        Span locationSpans[] = nameFinder.find(sentence);
        String[] locations = Span.spansToStrings(locationSpans, sentence);
        Arrays.stream(locations).forEach(LOGGER::info);
    }
    public void trainDocumentCategorizer() {
        try {
            InputStreamFactory isf = new InputStreamFactory() {
                public InputStream createInputStream() throws IOException {
                    return new FileInputStream("OpenNLP/doc-cat.train");
                }
            };
            ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
            ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
            DoccatFactory docCatFactory = new DoccatFactory();
            docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public void documentCategorizer() {
        DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel);
        double[] outcomes = myCategorizer.categorize(sentence);
        String category = myCategorizer.getBestCategory(outcomes);
        if (category.equalsIgnoreCase("GOOD")) {
            LOGGER.info("Document is positive :) ");
        } else {
            LOGGER.info("Document is negative :( ");
        }
    }
    public static void partOfSpeechTagger() throws IOException {
        try {
            POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin"));
            POSTaggerME posTaggerME = new POSTaggerME(posModel);
            InputStreamFactory isf = new InputStreamFactory() {
                public InputStream createInputStream() throws IOException {
                    return new FileInputStream("OpenNLP/PartOfSpeechTag.txt");
                }
            };
            ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
            String line;
            while ((line = lineStream.read()) != null) {
                String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
                String[] tags = posTaggerME.tag(whitespaceTokenizerLine);
                POSSample posSample = new POSSample(whitespaceTokenizerLine, tags);
                LOGGER.info(posSample.toString());
            }
            lineStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static void chunker() throws IOException {
        InputStream is = new FileInputStream("OpenNLP/en-chunker.bin");
        ChunkerModel cModel = new ChunkerModel(is);
        ChunkerME chunkerME = new ChunkerME(cModel);
        String[] taggedSentence = new String[] {"Out", "of", "the", "night", "that", "covers", "me"};
        String pos[] = new String[] { "IN", "IN", "DT", "NN", "WDT", "VBZ", "PRP"};
        String chunks[] = chunkerME.chunk(taggedSentence, pos);
        Arrays.stream(chunks).forEach(LOGGER::info);
    }
 }
--- a/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java
+++ b/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java
@ -0,0 +1,158 @@
 package com.baeldung.opennlp;
 import static org.junit.Assert.assertEquals;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import org.junit.Test;
 import opennlp.tools.chunker.ChunkerME;
 import opennlp.tools.chunker.ChunkerModel;
 import opennlp.tools.cmdline.postag.POSModelLoader;
 import opennlp.tools.doccat.DoccatFactory;
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.doccat.DocumentSampleStream;
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.postag.POSModel;
 import opennlp.tools.postag.POSSample;
 import opennlp.tools.postag.POSTaggerME;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.InputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
 public class OpenNLPTests {
    private final static String text = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south.";
    private final static String sentence[] = new String[] { "James", "Jordan", "live", "in", "Oklahoma", "city", "." };
    @Test
    public void givenText_WhenDetectSentences_ThenCountSentences(){
        InputStream is;
        SentenceModel model;
        try {
            is = new FileInputStream("OpenNLP/en-sent.bin");
            model = new SentenceModel(is);
            SentenceDetectorME sdetector = new SentenceDetectorME(model);
            String sentences[] = sdetector.sentDetect(text);
            assertEquals(4, sentences.length);
            is.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void givenText_WhenDetectTokens_ThenVerifyNames(){
        InputStream is;
        TokenNameFinderModel model;
        try {
            is = new FileInputStream("OpenNLP/en-ner-person.bin");
            model = new TokenNameFinderModel(is);
            is.close();
            NameFinderME nameFinder = new NameFinderME(model);
            Span nameSpans[] = nameFinder.find(sentence);
            String[] names = Span.spansToStrings(nameSpans, sentence);
            assertEquals(1, names.length);
            assertEquals("James Jordan", names[0]);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void givenText_WhenDetectTokens_ThenVerifyLocations(){
        InputStream is;
        TokenNameFinderModel model;
        try {
            is = new FileInputStream("OpenNLP/en-ner-location.bin");
            model = new TokenNameFinderModel(is);
            is.close();
            NameFinderME nameFinder = new NameFinderME(model);
            Span locationSpans[] = nameFinder.find(sentence);
            String[] locations = Span.spansToStrings(locationSpans, sentence);
            assertEquals(1, locations.length);
            assertEquals("Oklahoma", locations[0]);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void givenText_WhenCategorizeDocument_ThenVerifyDocumentContent(){
        DoccatModel docCatModel;
        try {
            InputStreamFactory isf = new InputStreamFactory() {
                public InputStream createInputStream() throws IOException {
                    return new FileInputStream("OpenNLP/doc-cat.train");
                }
            };
            ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
            ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
            DoccatFactory docCatFactory = new DoccatFactory();
            docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory);
            DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel);
            double[] outcomes = myCategorizer.categorize(sentence);
            String category = myCategorizer.getBestCategory(outcomes);
            assertEquals("GOOD", category);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void givenText_WhenTagDocument_ThenVerifyTaggedString(){
        try {
            POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin"));
            POSTaggerME posTaggerME = new POSTaggerME(posModel);
            InputStreamFactory isf = new InputStreamFactory() {
                public InputStream createInputStream() throws IOException {
                    return new FileInputStream("OpenNLP/PartOfSpeechTag.txt");
                }
            };
            ObjectStream<String> lineStream = new PlainTextByLineStream(isf, "UTF-8");
            String line;
            while ((line = lineStream.read()) != null) {
                String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
                String[] tags = posTaggerME.tag(whitespaceTokenizerLine);
                POSSample posSample = new POSSample(whitespaceTokenizerLine, tags);
                assertEquals("Out_IN of_IN the_DT night_NN that_WDT covers_VBZ me_PRP", posSample.toString());
            }
            lineStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void givenText_WhenChunked_ThenCountChunks(){
        try {
            InputStream is = new FileInputStream("OpenNLP/en-chunker.bin");
            ChunkerModel cModel = new ChunkerModel(is);
            ChunkerME chunkerME = new ChunkerME(cModel);
            String pos[] = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN", "VBD"};
            String chunks[] = chunkerME.chunk(sentence, pos);
            assertEquals(7, chunks.length);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
 }