JAVA-1848: Moved apache-opennlp to apache-libraries
This commit is contained in:
parent
1f57916d37
commit
d06218cc14
|
@ -1,7 +0,0 @@
|
||||||
## Apache OpenNLP
|
|
||||||
|
|
||||||
This module contains articles about Apache OpenNLP.
|
|
||||||
|
|
||||||
### Relevant Articles
|
|
||||||
|
|
||||||
- [Intro to Apache OpenNLP](https://www.baeldung.com/apache-open-nlp)
|
|
|
@ -1,37 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project
|
|
||||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
<artifactId>apache-opennlp</artifactId>
|
|
||||||
<version>1.0-SNAPSHOT</version>
|
|
||||||
<name>apache-opennlp</name>
|
|
||||||
<packaging>jar</packaging>
|
|
||||||
|
|
||||||
<parent>
|
|
||||||
<groupId>com.baeldung</groupId>
|
|
||||||
<artifactId>parent-modules</artifactId>
|
|
||||||
<version>1.0.0-SNAPSHOT</version>
|
|
||||||
</parent>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.opennlp</groupId>
|
|
||||||
<artifactId>opennlp-tools</artifactId>
|
|
||||||
<version>${org.apache.opennlp.opennlp-tools.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.assertj</groupId>
|
|
||||||
<artifactId>assertj-core</artifactId>
|
|
||||||
<version>${org.assertj.version}</version>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
<properties>
|
|
||||||
<org.assertj.version>3.9.0</org.assertj.version>
|
|
||||||
<org.apache.opennlp.opennlp-tools.version>1.8.4</org.apache.opennlp.opennlp-tools.version>
|
|
||||||
</properties>
|
|
||||||
|
|
||||||
</project>
|
|
|
@ -1,13 +0,0 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<configuration>
|
|
||||||
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
|
|
||||||
<encoder>
|
|
||||||
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
|
|
||||||
</pattern>
|
|
||||||
</encoder>
|
|
||||||
</appender>
|
|
||||||
|
|
||||||
<root level="INFO">
|
|
||||||
<appender-ref ref="STDOUT" />
|
|
||||||
</root>
|
|
||||||
</configuration>
|
|
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
|
@ -1,32 +0,0 @@
|
||||||
package com.baeldung.apache.opennlp;
|
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import opennlp.tools.chunker.ChunkerME;
|
|
||||||
import opennlp.tools.chunker.ChunkerModel;
|
|
||||||
import opennlp.tools.postag.POSModel;
|
|
||||||
import opennlp.tools.postag.POSTaggerME;
|
|
||||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class ChunkerUnitTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenChunkerModel_whenChunk_thenChunksAreDetected() throws Exception {
|
|
||||||
|
|
||||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
|
||||||
String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");
|
|
||||||
|
|
||||||
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
|
|
||||||
POSModel posModel = new POSModel(inputStreamPOSTagger);
|
|
||||||
POSTaggerME posTagger = new POSTaggerME(posModel);
|
|
||||||
String tags[] = posTagger.tag(tokens);
|
|
||||||
|
|
||||||
InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin");
|
|
||||||
ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
|
|
||||||
ChunkerME chunker = new ChunkerME(chunkerModel);
|
|
||||||
String[] chunks = chunker.chunk(tokens, tags);
|
|
||||||
assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,44 +0,0 @@
|
||||||
package com.baeldung.apache.opennlp;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import opennlp.tools.langdetect.Language;
|
|
||||||
import opennlp.tools.langdetect.LanguageDetector;
|
|
||||||
import opennlp.tools.langdetect.LanguageDetectorFactory;
|
|
||||||
import opennlp.tools.langdetect.LanguageDetectorME;
|
|
||||||
import opennlp.tools.langdetect.LanguageDetectorModel;
|
|
||||||
import opennlp.tools.langdetect.LanguageDetectorSampleStream;
|
|
||||||
import opennlp.tools.util.InputStreamFactory;
|
|
||||||
import opennlp.tools.util.MarkableFileInputStreamFactory;
|
|
||||||
import opennlp.tools.util.ObjectStream;
|
|
||||||
import opennlp.tools.util.PlainTextByLineStream;
|
|
||||||
import opennlp.tools.util.TrainingParameters;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
import static org.assertj.core.api.Assertions.tuple;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class LanguageDetectorAndTrainingDataUnitTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenLanguageDictionary_whenLanguageDetect_thenLanguageIsDetected() throws FileNotFoundException, IOException {
|
|
||||||
InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));
|
|
||||||
ObjectStream lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
|
|
||||||
LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
|
|
||||||
TrainingParameters params = new TrainingParameters();
|
|
||||||
params.put(TrainingParameters.ITERATIONS_PARAM, 100);
|
|
||||||
params.put(TrainingParameters.CUTOFF_PARAM, 5);
|
|
||||||
params.put("DataIndexer", "TwoPass");
|
|
||||||
params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
|
|
||||||
|
|
||||||
LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
|
|
||||||
|
|
||||||
LanguageDetector ld = new LanguageDetectorME(model);
|
|
||||||
Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
|
|
||||||
|
|
||||||
assertThat(Arrays.asList(languages)).extracting("lang", "confidence").contains(tuple("pob", 0.9999999950605625),
|
|
||||||
tuple("ita", 4.939427661577956E-9), tuple("spa", 9.665954064665144E-15),
|
|
||||||
tuple("fra", 8.250349924885834E-25));
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,29 +0,0 @@
|
||||||
package com.baeldung.apache.opennlp;
|
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
|
||||||
import opennlp.tools.postag.POSModel;
|
|
||||||
import opennlp.tools.postag.POSTaggerME;
|
|
||||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class LemmetizerUnitTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenEnglishDictionary_whenLemmatize_thenLemmasAreDetected() throws Exception {
|
|
||||||
|
|
||||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
|
||||||
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
|
|
||||||
|
|
||||||
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
|
|
||||||
POSModel posModel = new POSModel(inputStreamPOSTagger);
|
|
||||||
POSTaggerME posTagger = new POSTaggerME(posModel);
|
|
||||||
String tags[] = posTagger.tag(tokens);
|
|
||||||
InputStream dictLemmatizer = getClass().getResourceAsStream("/models/en-lemmatizer.dict");
|
|
||||||
DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
|
|
||||||
String[] lemmas = lemmatizer.lemmatize(tokens, tags);
|
|
||||||
|
|
||||||
assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,39 +0,0 @@
|
||||||
package com.baeldung.apache.opennlp;
|
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import opennlp.tools.namefind.NameFinderME;
|
|
||||||
import opennlp.tools.namefind.TokenNameFinderModel;
|
|
||||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
|
||||||
import opennlp.tools.util.Span;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class NamedEntityRecognitionUnitTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
|
|
||||||
|
|
||||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
|
||||||
String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
|
|
||||||
|
|
||||||
InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
|
|
||||||
TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
|
|
||||||
NameFinderME nameFinderME = new NameFinderME(model);
|
|
||||||
List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
|
|
||||||
assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
|
|
||||||
List<String> names = new ArrayList<String>();
|
|
||||||
int k = 0;
|
|
||||||
for (Span s : spans) {
|
|
||||||
names.add("");
|
|
||||||
for (int index = s.getStart(); index < s.getEnd(); index++) {
|
|
||||||
names.set(k, names.get(k) + tokens[index]);
|
|
||||||
}
|
|
||||||
k++;
|
|
||||||
}
|
|
||||||
assertThat(names).contains("John","Leonard","Penny");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,24 +0,0 @@
|
||||||
package com.baeldung.apache.opennlp;
|
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import opennlp.tools.postag.POSModel;
|
|
||||||
import opennlp.tools.postag.POSTaggerME;
|
|
||||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class POSTaggerUnitTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenPOSModel_whenPOSTagging_thenPOSAreDetected() throws Exception {
|
|
||||||
|
|
||||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
|
||||||
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
|
|
||||||
|
|
||||||
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
|
|
||||||
POSModel posModel = new POSModel(inputStreamPOSTagger);
|
|
||||||
POSTaggerME posTagger = new POSTaggerME(posModel);
|
|
||||||
String tags[] = posTagger.tag(tokens);
|
|
||||||
assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,28 +0,0 @@
|
||||||
package com.baeldung.apache.opennlp;
|
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import opennlp.tools.sentdetect.SentenceDetectorME;
|
|
||||||
import opennlp.tools.sentdetect.SentenceModel;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class SentenceDetectionUnitTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenEnglishModel_whenDetect_thenSentencesAreDetected() throws Exception {
|
|
||||||
|
|
||||||
String paragraph = "This is a statement. This is another statement. Now is an abstract word for time, "
|
|
||||||
+ "that is always flying. And my email address is google@gmail.com.";
|
|
||||||
|
|
||||||
InputStream is = getClass().getResourceAsStream("/models/en-sent.bin");
|
|
||||||
SentenceModel model = new SentenceModel(is);
|
|
||||||
|
|
||||||
SentenceDetectorME sdetector = new SentenceDetectorME(model);
|
|
||||||
|
|
||||||
String sentences[] = sdetector.sentDetect(paragraph);
|
|
||||||
assertThat(sentences).contains("This is a statement.",
|
|
||||||
"This is another statement.",
|
|
||||||
"Now is an abstract word for time, that is always flying.",
|
|
||||||
"And my email address is google@gmail.com.");
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,36 +0,0 @@
|
||||||
package com.baeldung.apache.opennlp;
|
|
||||||
|
|
||||||
import java.io.InputStream;
|
|
||||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
|
||||||
import opennlp.tools.tokenize.TokenizerME;
|
|
||||||
import opennlp.tools.tokenize.TokenizerModel;
|
|
||||||
import opennlp.tools.tokenize.WhitespaceTokenizer;
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class TokenizerUnitTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenEnglishModel_whenTokenize_thenTokensAreDetected() throws Exception {
|
|
||||||
InputStream inputStream = getClass().getResourceAsStream("/models/en-token.bin");
|
|
||||||
TokenizerModel model = new TokenizerModel(inputStream);
|
|
||||||
TokenizerME tokenizer = new TokenizerME(model);
|
|
||||||
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
|
|
||||||
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenWhitespaceTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
|
|
||||||
WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
|
|
||||||
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
|
|
||||||
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void givenSimpleTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
|
|
||||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
|
||||||
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
|
|
||||||
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue