Merge pull request #3909 from mansi2392/master
BAEL-1557 Intro to Apache OpenNlp
This commit is contained in:
commit
b20a3a1097
|
@ -0,0 +1,32 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.baeldung</groupId>
|
||||
<artifactId>parent-modules</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>apache-opennlp</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.opennlp</groupId>
|
||||
<artifactId>opennlp-tools</artifactId>
|
||||
<version>1.8.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.assertj</groupId>
|
||||
<artifactId>assertj-core</artifactId>
|
||||
<version>3.9.0</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.12</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,32 @@
|
|||
package com.baeldung.apache.opennlp;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import opennlp.tools.chunker.ChunkerME;
|
||||
import opennlp.tools.chunker.ChunkerModel;
|
||||
import opennlp.tools.postag.POSModel;
|
||||
import opennlp.tools.postag.POSTaggerME;
|
||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.Test;
|
||||
|
||||
public class ChunkerTest {
|
||||
|
||||
@Test
|
||||
public void givenChunkerModel_whenChunk_thenChunksAreDetected() throws Exception {
|
||||
|
||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
||||
String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");
|
||||
|
||||
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
|
||||
POSModel posModel = new POSModel(inputStreamPOSTagger);
|
||||
POSTaggerME posTagger = new POSTaggerME(posModel);
|
||||
String tags[] = posTagger.tag(tokens);
|
||||
|
||||
InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin");
|
||||
ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
|
||||
ChunkerME chunker = new ChunkerME(chunkerModel);
|
||||
String[] chunks = chunker.chunk(tokens, tags);
|
||||
assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package com.baeldung.apache.opennlp;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import opennlp.tools.langdetect.Language;
|
||||
import opennlp.tools.langdetect.LanguageDetector;
|
||||
import opennlp.tools.langdetect.LanguageDetectorFactory;
|
||||
import opennlp.tools.langdetect.LanguageDetectorME;
|
||||
import opennlp.tools.langdetect.LanguageDetectorModel;
|
||||
import opennlp.tools.langdetect.LanguageDetectorSampleStream;
|
||||
import opennlp.tools.util.InputStreamFactory;
|
||||
import opennlp.tools.util.MarkableFileInputStreamFactory;
|
||||
import opennlp.tools.util.ObjectStream;
|
||||
import opennlp.tools.util.PlainTextByLineStream;
|
||||
import opennlp.tools.util.TrainingParameters;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.Test;
|
||||
|
||||
public class LanguageDetectorAndTrainingDataTest {
|
||||
|
||||
@Test
|
||||
public void givenLanguageDictionary_whenLanguageDetect_thenLanguageIsDetected() throws FileNotFoundException, IOException {
|
||||
InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));
|
||||
ObjectStream lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
|
||||
LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
|
||||
TrainingParameters params = new TrainingParameters();
|
||||
params.put(TrainingParameters.ITERATIONS_PARAM, 100);
|
||||
params.put(TrainingParameters.CUTOFF_PARAM, 5);
|
||||
params.put("DataIndexer", "TwoPass");
|
||||
params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
|
||||
|
||||
LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
|
||||
|
||||
LanguageDetector ld = new LanguageDetectorME(model);
|
||||
Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
|
||||
assertThat(Arrays.asList(languages).toString()).contains("pob (0.9999999950605625)", "ita (4.939427661577956E-9)", "spa (9.665954064665144E-15)",
|
||||
"fra (8.250349924885834E-25)");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package com.baeldung.apache.opennlp;
|
||||
|
||||
import java.io.InputStream;
|
||||
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
|
||||
import opennlp.tools.postag.POSModel;
|
||||
import opennlp.tools.postag.POSTaggerME;
|
||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.Test;
|
||||
|
||||
public class LemmetizerTest {
|
||||
|
||||
@Test
|
||||
public void givenEnglishDictionary_whenLemmatize_thenLemmasAreDetected() throws Exception {
|
||||
|
||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
||||
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
|
||||
|
||||
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
|
||||
POSModel posModel = new POSModel(inputStreamPOSTagger);
|
||||
POSTaggerME posTagger = new POSTaggerME(posModel);
|
||||
String tags[] = posTagger.tag(tokens);
|
||||
InputStream dictLemmatizer = getClass().getResourceAsStream("/models/en-lemmatizer.dict");
|
||||
DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
|
||||
String[] lemmas = lemmatizer.lemmatize(tokens, tags);
|
||||
|
||||
assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
package com.baeldung.apache.opennlp;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import opennlp.tools.namefind.NameFinderME;
|
||||
import opennlp.tools.namefind.TokenNameFinderModel;
|
||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
||||
import opennlp.tools.util.Span;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.Test;
|
||||
|
||||
public class NamedEntityRecognitionTest {
|
||||
|
||||
@Test
|
||||
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
|
||||
|
||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
||||
String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
|
||||
|
||||
InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
|
||||
TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
|
||||
NameFinderME nameFinderME = new NameFinderME(model);
|
||||
List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
|
||||
assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
|
||||
List<String> names = new ArrayList<String>();
|
||||
int k = 0;
|
||||
for (Span s : spans) {
|
||||
names.add("");
|
||||
for (int index = s.getStart(); index < s.getEnd(); index++) {
|
||||
names.set(k, names.get(k) + tokens[index]);
|
||||
}
|
||||
k++;
|
||||
}
|
||||
assertThat(names).contains("John","Leonard","Penny");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package com.baeldung.apache.opennlp;
|
||||
|
||||
import java.io.InputStream;
|
||||
import opennlp.tools.postag.POSModel;
|
||||
import opennlp.tools.postag.POSTaggerME;
|
||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.Test;
|
||||
|
||||
public class POSTaggerTest {
|
||||
|
||||
@Test
|
||||
public void givenPOSModel_whenPOSTagging_thenPOSAreDetected() throws Exception {
|
||||
|
||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
||||
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
|
||||
|
||||
InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
|
||||
POSModel posModel = new POSModel(inputStreamPOSTagger);
|
||||
POSTaggerME posTagger = new POSTaggerME(posModel);
|
||||
String tags[] = posTagger.tag(tokens);
|
||||
assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package com.baeldung.apache.opennlp;
|
||||
|
||||
import java.io.InputStream;
|
||||
import opennlp.tools.sentdetect.SentenceDetectorME;
|
||||
import opennlp.tools.sentdetect.SentenceModel;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.Test;
|
||||
|
||||
public class SentenceDetectionTest {
|
||||
|
||||
@Test
|
||||
public void givenEnglishModel_whenDetect_thenSentencesAreDetected() throws Exception {
|
||||
|
||||
String paragraph = "This is a statement. This is another statement. Now is an abstract word for time, "
|
||||
+ "that is always flying. And my email address is google@gmail.com.";
|
||||
|
||||
InputStream is = getClass().getResourceAsStream("/models/en-sent.bin");
|
||||
SentenceModel model = new SentenceModel(is);
|
||||
|
||||
SentenceDetectorME sdetector = new SentenceDetectorME(model);
|
||||
|
||||
String sentences[] = sdetector.sentDetect(paragraph);
|
||||
assertThat(sentences).contains("This is a statement.",
|
||||
"This is another statement.",
|
||||
"Now is an abstract word for time, that is always flying.",
|
||||
"And my email address is google@gmail.com.");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package com.baeldung.apache.opennlp;
|
||||
|
||||
import java.io.InputStream;
|
||||
import opennlp.tools.tokenize.SimpleTokenizer;
|
||||
import opennlp.tools.tokenize.TokenizerME;
|
||||
import opennlp.tools.tokenize.TokenizerModel;
|
||||
import opennlp.tools.tokenize.WhitespaceTokenizer;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TokenizerTest {
|
||||
|
||||
@Test
|
||||
public void givenEnglishModel_whenTokenize_thenTokensAreDetected() throws Exception {
|
||||
InputStream inputStream = getClass().getResourceAsStream("/models/en-token.bin");
|
||||
TokenizerModel model = new TokenizerModel(inputStream);
|
||||
TokenizerME tokenizer = new TokenizerME(model);
|
||||
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
|
||||
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void givenWhitespaceTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
|
||||
WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
|
||||
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
|
||||
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void givenSimpleTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
|
||||
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
|
||||
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
|
||||
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue