From b0116c225e9ceeee98f7b8aa4638f6fa71149bb8 Mon Sep 17 00:00:00 2001 From: Michael Olayemi Date: Sat, 22 Apr 2023 06:32:53 +0100 Subject: [PATCH] BAEL-5766 Overview of NLP Libraries in Java (#13798) * BAEL-5766 Overview of NLP Libraries in Java * BAEL-5766 Overview of NLP Libraries in Java * Overview of NLP Libraries in Java * Overview of NLP Libraries in Java --- libraries-ai/README.md | 1 + libraries-ai/pom.xml | 33 +++++++++++++++ .../nlp/CoreNLPTokenizerUnitTest.java | 41 +++++++++++++++++++ .../baeldung/nlp/OpenNLPLanguageDetector.java | 33 +++++++++++++++ pom.xml | 1 + 5 files changed, 109 insertions(+) create mode 100644 libraries-ai/README.md create mode 100644 libraries-ai/pom.xml create mode 100644 libraries-ai/src/test/java/com/baeldung/nlp/CoreNLPTokenizerUnitTest.java create mode 100644 libraries-ai/src/test/java/com/baeldung/nlp/OpenNLPLanguageDetector.java diff --git a/libraries-ai/README.md b/libraries-ai/README.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/libraries-ai/README.md @@ -0,0 +1 @@ + diff --git a/libraries-ai/pom.xml b/libraries-ai/pom.xml new file mode 100644 index 0000000000..2c1490440f --- /dev/null +++ b/libraries-ai/pom.xml @@ -0,0 +1,33 @@ + + + 4.0.0 + libraries-ai + libraries-ai + + + com.baeldung + parent-modules + 1.0.0-SNAPSHOT + + + + + edu.stanford.nlp + stanford-corenlp + ${stanford-corenlp.version} + + + org.apache.opennlp + opennlp-tools + ${opennlp-tools.version} + + + + + 4.5.3 + 2.1.1 + + + \ No newline at end of file diff --git a/libraries-ai/src/test/java/com/baeldung/nlp/CoreNLPTokenizerUnitTest.java b/libraries-ai/src/test/java/com/baeldung/nlp/CoreNLPTokenizerUnitTest.java new file mode 100644 index 0000000000..11c5cd0be8 --- /dev/null +++ b/libraries-ai/src/test/java/com/baeldung/nlp/CoreNLPTokenizerUnitTest.java @@ -0,0 +1,41 @@ +package com.baeldung.nlp; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import 
edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+import edu.stanford.nlp.util.CoreMap;
+import org.junit.Test;
+
+import java.util.List;
+import java.util.Properties;
+
+import static org.junit.Assert.assertEquals;
+
+/** Checks that a CoreNLP pipeline configured with the "tokenize" annotator yields the words of a sample text. */
+public class CoreNLPTokenizerUnitTest {
+    @Test
+    public void givenSampleText_whenTokenize_thenExpectedTokensReturned() {
+        // The pipeline is configured through the "annotators" property; only "tokenize" is requested.
+        Properties props = new Properties();
+        props.setProperty("annotators", "tokenize");
+
+        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
+        String text = "The german shepard display an act of kindness";
+
+        Annotation document = new Annotation(text);
+        pipeline.annotate(document);
+
+        // NOTE(review): only "tokenize" is configured; confirm it also populates SentencesAnnotation (else null).
+        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
+        StringBuilder tokens = new StringBuilder();
+        // Rebuild the text by appending each token's TextAnnotation followed by a single space.
+        for (CoreMap sentence : sentences) {
+            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
+                String word = token.get(CoreAnnotations.TextAnnotation.class);
+                tokens.append(word).append(" ");
+            }
+        }
+        assertEquals("The german shepard display an act of kindness", tokens.toString().trim());
+    }
+}
diff --git a/libraries-ai/src/test/java/com/baeldung/nlp/OpenNLPLanguageDetector.java b/libraries-ai/src/test/java/com/baeldung/nlp/OpenNLPLanguageDetector.java
new file mode 100644
index 0000000000..00792b4875
--- /dev/null
+++ b/libraries-ai/src/test/java/com/baeldung/nlp/OpenNLPLanguageDetector.java
@@ -0,0 +1,33 @@
+package com.baeldung.nlp;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import org.junit.jupiter.api.Test;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class OpenNLPLanguageDetector {
+
+    @Test
+    public void givenTextInEnglish_whenDetectLanguage_thenReturnsEnglishLanguageCode() {
+
+        String text = "the dream my father 
told me";
+        LanguageDetectorModel model;
+        // Read the detector model from the relative path below; if unreadable, skip the test instead of passing.
+        try (InputStream modelIn = new FileInputStream("langdetect-183.bin")) {
+            model = new LanguageDetectorModel(modelIn);
+        } catch (IOException e) {
+            org.junit.jupiter.api.Assumptions.assumeTrue(false, "langdetect-183.bin could not be loaded: " + e);
+            return; // not reached at runtime (the failed assumption throws); keeps 'model' definitely assigned
+        }
+
+        LanguageDetectorME detector = new LanguageDetectorME(model);
+        Language language = detector.predictLanguage(text);
+        assertEquals("eng", language.getLang());
+    }
+}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 0b5b73ddfd..9cab85f40b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1148,6 +1148,7 @@
         libraries-http
         libraries-http-2
         libraries-io
+        libraries-ai
         libraries-primitive
         libraries-rpc
         libraries-server