diff --git a/libraries/OpenNLP/PartOfSpeechTag.txt b/libraries/OpenNLP/PartOfSpeechTag.txt
deleted file mode 100644
index fdd8238ec4..0000000000
--- a/libraries/OpenNLP/PartOfSpeechTag.txt
+++ /dev/null
@@ -1 +0,0 @@
-Out of the night that covers me
\ No newline at end of file
diff --git a/libraries/OpenNLP/doc-cat.train b/libraries/OpenNLP/doc-cat.train
deleted file mode 100644
index c457221ec6..0000000000
--- a/libraries/OpenNLP/doc-cat.train
+++ /dev/null
@@ -1,10 +0,0 @@
-GOOD good morning /
-GOOD good evening /
-GOOD have a good day /
-GOOD nice party! /
-GOOD fine pants /
-BAD nightmare volcano in the sea /
-BAD darkest sky /
-BAD greed and waste /
-BAD army attacks /
-BAD bomb explodes /
\ No newline at end of file
diff --git a/libraries/OpenNLP/en-chunker.bin b/libraries/OpenNLP/en-chunker.bin
deleted file mode 100644
index 65d9356888..0000000000
Binary files a/libraries/OpenNLP/en-chunker.bin and /dev/null differ
diff --git a/libraries/OpenNLP/en-ner-location.bin b/libraries/OpenNLP/en-ner-location.bin
deleted file mode 100644
index f3788bc1f6..0000000000
Binary files a/libraries/OpenNLP/en-ner-location.bin and /dev/null differ
diff --git a/libraries/OpenNLP/en-ner-person.bin b/libraries/OpenNLP/en-ner-person.bin
deleted file mode 100644
index 2f68318203..0000000000
Binary files a/libraries/OpenNLP/en-ner-person.bin and /dev/null differ
diff --git a/libraries/OpenNLP/en-pos-maxent.bin b/libraries/OpenNLP/en-pos-maxent.bin
deleted file mode 100644
index c8cae23c5f..0000000000
Binary files a/libraries/OpenNLP/en-pos-maxent.bin and /dev/null differ
diff --git a/libraries/OpenNLP/en-sent.bin b/libraries/OpenNLP/en-sent.bin
deleted file mode 100644
index e89076be5a..0000000000
Binary files a/libraries/OpenNLP/en-sent.bin and /dev/null differ
diff --git a/libraries/OpenNLP/en-token.bin b/libraries/OpenNLP/en-token.bin
deleted file mode 100644
index c417277ca7..0000000000
Binary files a/libraries/OpenNLP/en-token.bin and /dev/null differ
diff --git a/libraries/pom.xml b/libraries/pom.xml
index ff9c72399d..a3b78f1695 100644
--- a/libraries/pom.xml
+++ b/libraries/pom.xml
@@ -338,12 +338,6 @@
             <artifactId>netty-all</artifactId>
             <version>${netty.version}</version>
         </dependency>
-
-        <dependency>
-            <groupId>org.apache.opennlp</groupId>
-            <artifactId>opennlp-tools</artifactId>
-            <version>1.8.0</version>
-        </dependency>
         <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
diff --git a/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java b/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java
deleted file mode 100644
index dd44e96a3a..0000000000
--- a/libraries/src/main/java/com/baeldung/opennlp/OpenNLP.java
+++ /dev/null
@@ -1,188 +0,0 @@
-package com.baeldung.opennlp;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.logging.Logger;
-
-import opennlp.tools.chunker.ChunkerME;
-import opennlp.tools.chunker.ChunkerModel;
-import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.doccat.DoccatFactory;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.doccat.DocumentSampleStream;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.postag.POSModel;
-import opennlp.tools.postag.POSSample;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.sentdetect.SentenceDetectorME;
-import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.TokenizerME;
-import opennlp.tools.tokenize.TokenizerModel;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
-import opennlp.tools.util.InputStreamFactory;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
-import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
-
-public class OpenNLP {
-
-    private final static Logger LOGGER = Logger.getLogger(OpenNLP.class.getName());
-    private final static String text = buildString();
-    private final static String sentence[] = new String[] { "James", "Jordan", "live", "in", "Oklahoma", "city", "." };
-
-    private DoccatModel docCatModel;
-
-    public static void main(String[] args) {
-        new OpenNLP();
-    }
-
-    public static String buildString(){
-        StringBuilder sb = new StringBuilder();
-        sb.append("To get to the south:");
-        sb.append(" Go to the store.");
-        sb.append(" Buy a compass.");
-        sb.append(" Use the compass.");
-        sb.append(" Then walk to the south.");
-        return sb.toString();
-    }
-
-    public OpenNLP() {
-        try {
-            sentenceDetector();
-            tokenizer();
-            nameFinder();
-            locationFinder();
-            trainDocumentCategorizer();
-            documentCategorizer();
-            partOfSpeechTagger();
-            chunker();
-        } catch (InvalidFormatException e) {
-            e.printStackTrace();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void sentenceDetector() throws InvalidFormatException, IOException {
-
-        InputStream is = new FileInputStream("OpenNLP/en-sent.bin");
-        SentenceModel model = new SentenceModel(is);
-        SentenceDetectorME sdetector = new SentenceDetectorME(model);
-        String sentences[] = sdetector.sentDetect(text);
-        for (String sentence : sentences) {
-            LOGGER.info(sentence);
-        }
-        is.close();
-    }
-
-    public void tokenizer() throws InvalidFormatException, IOException {
-        InputStream is = new FileInputStream("OpenNLP/en-token.bin");
-        TokenizerModel model = new TokenizerModel(is);
-        Tokenizer tokenizer = new TokenizerME(model);
-        String tokens[] = tokenizer.tokenize(text);
-        for (String token : tokens) {
-            LOGGER.info(token);
-        }
-        is.close();
-    }
-
-    public static void nameFinder() throws IOException {
-        InputStream is = new FileInputStream("OpenNLP/en-ner-person.bin");
-        TokenNameFinderModel model = new TokenNameFinderModel(is);
-        is.close();
-        NameFinderME nameFinder = new NameFinderME(model);
-        Span nameSpans[] = nameFinder.find(sentence);
-        String[] names = Span.spansToStrings(nameSpans, sentence);
-        Arrays.stream(names).forEach(LOGGER::info);
-        for (String name : names) {
-            LOGGER.info(name);
-        }
-    }
-
-    public static void locationFinder() throws IOException {
-        InputStream is = new FileInputStream("OpenNLP/en-ner-location.bin");
-        TokenNameFinderModel model = new TokenNameFinderModel(is);
-        is.close();
-        NameFinderME nameFinder = new NameFinderME(model);
-        Span locationSpans[] = nameFinder.find(sentence);
-        String[] locations = Span.spansToStrings(locationSpans, sentence);
-        Arrays.stream(locations).forEach(LOGGER::info);
-        for (String location : locations) {
-            LOGGER.info(location);
-        }
-    }
-
-    public void trainDocumentCategorizer() {
-
-        try {
-            InputStreamFactory isf = new InputStreamFactory() {
-                public InputStream createInputStream() throws IOException {
-                    return new FileInputStream("OpenNLP/doc-cat.train");
-                }
-            };
-            ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8");
-            ObjectStream sampleStream = new DocumentSampleStream(lineStream);
-            DoccatFactory docCatFactory = new DoccatFactory();
-            docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory);
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public void documentCategorizer() {
-        DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel);
-        double[] outcomes = myCategorizer.categorize(sentence);
-        String category = myCategorizer.getBestCategory(outcomes);
-
-        if (category.equalsIgnoreCase("GOOD")) {
-            LOGGER.info("Document is positive :) ");
-        } else {
-            LOGGER.info("Document is negative :( ");
-        }
-    }
-
-    public static void partOfSpeechTagger() throws IOException {
-        try {
-            POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin"));
-            POSTaggerME posTaggerME = new POSTaggerME(posModel);
-            InputStreamFactory isf = new InputStreamFactory() {
-                public InputStream createInputStream() throws IOException {
-                    return new FileInputStream("OpenNLP/PartOfSpeechTag.txt");
-                }
-            };
-            ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8");
-            String line;
-            while ((line = lineStream.read()) != null) {
-                String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
-                String[] tags = posTaggerME.tag(whitespaceTokenizerLine);
-                POSSample posSample = new POSSample(whitespaceTokenizerLine, tags);
-                LOGGER.info(posSample.toString());
-            }
-            lineStream.close();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public static void chunker() throws IOException {
-        InputStream is = new FileInputStream("OpenNLP/en-chunker.bin");
-        ChunkerModel cModel = new ChunkerModel(is);
-        ChunkerME chunkerME = new ChunkerME(cModel);
-        String[] taggedSentence = new String[] {"Out", "of", "the", "night", "that", "covers", "me"};
-        String pos[] = new String[] { "IN", "IN", "DT", "NN", "WDT", "VBZ", "PRP"};
-        String chunks[] = chunkerME.chunk(taggedSentence, pos);
-        for (String chunk : chunks) {
-            LOGGER.info(chunk);
-        }
-    }
-
-}
diff --git a/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java b/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java
deleted file mode 100644
index 38bc8e002b..0000000000
--- a/libraries/src/test/java/com/baeldung/opennlp/OpenNLPTests.java
+++ /dev/null
@@ -1,151 +0,0 @@
-package com.baeldung.opennlp;
-
-import opennlp.tools.chunker.ChunkerME;
-import opennlp.tools.chunker.ChunkerModel;
-import opennlp.tools.cmdline.postag.POSModelLoader;
-import opennlp.tools.doccat.DoccatFactory;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.doccat.DocumentSampleStream;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.postag.POSModel;
-import opennlp.tools.postag.POSSample;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.sentdetect.SentenceDetectorME;
-import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
-import opennlp.tools.util.InputStreamFactory;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
-import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-
-import static org.junit.Assert.assertEquals;
-
-public class OpenNLPTests {
-
-    private final static String text = "To get to the south: Go to the store. Buy a compass. Use the compass. Then walk to the south.";
-    private final static String sentence[] = new String[]{"James", "Jordan", "live", "in", "Oklahoma", "city", "."};
-
-    @Test
-    public void givenText_WhenDetectSentences_ThenCountSentences() {
-        InputStream is;
-        SentenceModel model;
-        try {
-            is = new FileInputStream("OpenNLP/en-sent.bin");
-            model = new SentenceModel(is);
-            SentenceDetectorME sdetector = new SentenceDetectorME(model);
-            String sentences[] = sdetector.sentDetect(text);
-            assertEquals(4, sentences.length);
-            is.close();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    @Test
-    public void givenText_WhenDetectTokens_ThenVerifyNames() {
-        InputStream is;
-        TokenNameFinderModel model;
-        try {
-            is = new FileInputStream("OpenNLP/en-ner-person.bin");
-            model = new TokenNameFinderModel(is);
-            is.close();
-            NameFinderME nameFinder = new NameFinderME(model);
-            Span nameSpans[] = nameFinder.find(sentence);
-            String[] names = Span.spansToStrings(nameSpans, sentence);
-            assertEquals(1, names.length);
-            assertEquals("James Jordan", names[0]);
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    @Test
-    public void givenText_WhenDetectTokens_ThenVerifyLocations() {
-        InputStream is;
-        TokenNameFinderModel model;
-        try {
-            is = new FileInputStream("OpenNLP/en-ner-location.bin");
-            model = new TokenNameFinderModel(is);
-            is.close();
-            NameFinderME nameFinder = new NameFinderME(model);
-            Span locationSpans[] = nameFinder.find(sentence);
-            String[] locations = Span.spansToStrings(locationSpans, sentence);
-            assertEquals(1, locations.length);
-            assertEquals("Oklahoma", locations[0]);
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    @Test
-    public void givenText_WhenCategorizeDocument_ThenVerifyDocumentContent() {
-        DoccatModel docCatModel;
-        try {
-            InputStreamFactory isf = new InputStreamFactory() {
-                public InputStream createInputStream() throws IOException {
-                    return new FileInputStream("OpenNLP/doc-cat.train");
-                }
-            };
-            ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8");
-            ObjectStream sampleStream = new DocumentSampleStream(lineStream);
-            DoccatFactory docCatFactory = new DoccatFactory();
-            docCatModel = DocumentCategorizerME.train("en", sampleStream, TrainingParameters.defaultParams(), docCatFactory);
-            DocumentCategorizerME myCategorizer = new DocumentCategorizerME(docCatModel);
-            double[] outcomes = myCategorizer.categorize(sentence);
-            String category = myCategorizer.getBestCategory(outcomes);
-            assertEquals("GOOD", category);
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    @Test
-    public void givenText_WhenTagDocument_ThenVerifyTaggedString() {
-        try {
-            POSModel posModel = new POSModelLoader().load(new File("OpenNLP/en-pos-maxent.bin"));
-            POSTaggerME posTaggerME = new POSTaggerME(posModel);
-            InputStreamFactory isf = new InputStreamFactory() {
-                public InputStream createInputStream() throws IOException {
-                    return new FileInputStream("OpenNLP/PartOfSpeechTag.txt");
-                }
-            };
-            ObjectStream lineStream = new PlainTextByLineStream(isf, "UTF-8");
-            String line;
-            while ((line = lineStream.read()) != null) {
-                String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
-                String[] tags = posTaggerME.tag(whitespaceTokenizerLine);
-                POSSample posSample = new POSSample(whitespaceTokenizerLine, tags);
-                assertEquals("Out_IN of_IN the_DT night_NN that_WDT covers_VBZ me_PRP", posSample.toString());
-            }
-            lineStream.close();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    @Test
-    public void givenText_WhenChunked_ThenCountChunks() {
-        try {
-            InputStream is = new FileInputStream("OpenNLP/en-chunker.bin");
-            ChunkerModel cModel = new ChunkerModel(is);
-            ChunkerME chunkerME = new ChunkerME(cModel);
-            String pos[] = new String[]{"NNP", "NNP", "NNP", "POS", "NNP", "NN", "VBD"};
-            String chunks[] = chunkerME.chunk(sentence, pos);
-            assertEquals(7, chunks.length);
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-}