BAEL-1557 Intro to Apache OpenNlp
parent 764ccb54a9
commit db10f66e60
pom.xml
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>com.baeldung</groupId>
        <artifactId>parent-modules</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>

    <artifactId>apache-opennlp</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <dependencies>
        <dependency>
            <groupId>org.apache.opennlp</groupId>
            <artifactId>opennlp-tools</artifactId>
            <version>1.8.4</version>
        </dependency>
        <dependency>
            <groupId>org.assertj</groupId>
            <artifactId>assertj-core</artifactId>
            <version>3.9.0</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
(Binary model files and large resource files added by this commit are not shown in the diff.)
ChunkerTest.java
@@ -0,0 +1,36 @@
package com.baeldung.apache.opennlp;

import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;

/**
 * @author Parth
 */
public class ChunkerTest {

    @Test
    public void givenSentence_whenChunk_thenGetChunks() throws Exception {
        // Tokenize the sentence first.
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");

        // Tag each token with its part of speech using the pre-trained maxent model.
        InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin");
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);

        // Group the tagged tokens into phrases: B-/I- mark the beginning/inside of noun, verb and prepositional phrases.
        InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin");
        ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
        ChunkerME chunker = new ChunkerME(chunkerModel);
        String[] chunks = chunker.chunk(tokens, tags);

        assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
    }
}
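ChunkerME can also report how confident it is in the chunk tags it just assigned. A minimal sketch, not part of this commit, assuming the chunker, tokens, and chunks from the test above:

        // Hypothetical follow-up: probabilities for the last chunk() call, one value per token.
        double[] chunkProbs = chunker.probs();
        assertThat(chunkProbs).hasSameSizeAs(chunks);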
LanguageDetectorAndTrainingData.java
@@ -0,0 +1,41 @@
package com.baeldung.apache.opennlp;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetector;
import opennlp.tools.langdetect.LanguageDetectorFactory;
import opennlp.tools.langdetect.LanguageDetectorME;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.langdetect.LanguageDetectorSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;

public class LanguageDetectorAndTrainingData {

    @Test
    public void givenTrainingData_whenTrainAndDetect_thenLanguagesArePredicted() throws FileNotFoundException, IOException {
        // Read the training data: one "<language> <text>" sample per line.
        InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));
        ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
        LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);

        TrainingParameters params = new TrainingParameters();
        params.put(TrainingParameters.ITERATIONS_PARAM, 100);
        params.put(TrainingParameters.CUTOFF_PARAM, 5);
        params.put("DataIndexer", "TwoPass");
        params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");

        // Train a language detection model from the samples, then rank candidate languages for a sentence.
        LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());

        LanguageDetector ld = new LanguageDetectorME(model);
        Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
        assertThat(Arrays.asList(languages).toString()).contains("pob (0.9999999950605625)", "ita (4.939427661577956E-9)",
            "spa (9.665954064665144E-15)", "fra (8.250349924885834E-25)");
    }
}
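When only the best match is needed, LanguageDetector can return a single prediction instead of the full ranking. A small sketch, not part of this commit, assuming the ld detector trained above:

        // Hypothetical follow-up: single best language for the same Portuguese sentence.
        Language bestLanguage = ld.predictLanguage("estava em uma marcenaria na Rua Bruno");
        assertThat(bestLanguage.getLang()).isEqualTo("pob");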
LemmatizerTest.java
@@ -0,0 +1,31 @@
package com.baeldung.apache.opennlp;

import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;

public class LemmatizerTest {

    @Test
    public void givenSentence_whenLemmatize_thenGetLemmas() throws Exception {
        // The lemmatizer needs both tokens and POS tags, so tokenize and tag the sentence first.
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

        InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin");
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);

        // Look up each token/tag pair in the lemma dictionary; "O" means no dictionary entry was found.
        InputStream dictLemmatizer = new FileInputStream("src/main/resources/models/en-lemmatizer.dict");
        DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
        String[] lemmas = lemmatizer.lemmatize(tokens, tags);

        assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
    }
}
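Besides the dictionary lookup, OpenNLP also offers a statistical LemmatizerME with the same lemmatize(tokens, tags) call, but it needs a trained lemmatizer model. A sketch, not part of this commit; the model path is an assumption and the snippet needs imports for opennlp.tools.lemmatizer.LemmatizerME and LemmatizerModel:

        // Hypothetical alternative: maxent-based lemmatizer instead of the dictionary.
        // The model file below is an assumption; it is not bundled with OpenNLP or this commit.
        InputStream lemmatizerModelIn = new FileInputStream("src/main/resources/models/en-lemmatizer.bin");
        LemmatizerModel lemmatizerModel = new LemmatizerModel(lemmatizerModelIn);
        LemmatizerME statisticalLemmatizer = new LemmatizerME(lemmatizerModel);
        String[] statisticalLemmas = statisticalLemmatizer.lemmatize(tokens, tags);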
NamedEntityRecognitionTest.java
@@ -0,0 +1,40 @@
package com.baeldung.apache.opennlp;

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;

public class NamedEntityRecognitionTest {

    @Test
    public void givenTextWithPersonNames_whenNER_thenGetPersonNamesList() throws Exception {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");

        // The person-name model returns Spans: token index ranges labeled "person".
        InputStream inputStreamNameFinder = new FileInputStream("src/main/resources/models/en-ner-person.bin");
        TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
        NameFinderME nameFinderME = new NameFinderME(model);
        List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
        assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");

        // Rebuild each entity string by concatenating the tokens covered by its span.
        List<String> names = new ArrayList<>();
        int k = 0;
        for (Span s : spans) {
            names.add("");
            for (int index = s.getStart(); index < s.getEnd(); index++) {
                names.set(k, names.get(k) + tokens[index]);
            }
            k++;
        }
        assertThat(names).contains("John", "Leonard", "Penny");
    }
}
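The manual span-to-string loop above can also be expressed with the Span utility method. A minimal sketch, not part of this commit, using the spans and tokens from the test:

        // Hypothetical shortcut: let OpenNLP extract the covered token text for each span.
        String[] namesFromSpans = Span.spansToStrings(spans.toArray(new Span[0]), tokens);
        assertThat(namesFromSpans).contains("John", "Leonard", "Penny");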
POSTaggerTest.java
@@ -0,0 +1,25 @@
package com.baeldung.apache.opennlp;

import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;

public class POSTaggerTest {

    @Test
    public void givenSentence_whenPOSTagging_thenGetTags() throws Exception {
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

        // Tag each token with a Penn Treebank part-of-speech tag (NNP, VBZ, DT, ...).
        InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin");
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);
        assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
    }
}
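POSTaggerME also reports a confidence for each tag it assigned. A small sketch, not part of this commit, assuming the posTagger and tags from the test above:

        // Hypothetical follow-up: per-token confidence of the last tag() call.
        double[] tagProbs = posTagger.probs();
        assertThat(tagProbs).hasSameSizeAs(tags);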
SentenceDetectionTest.java
@@ -0,0 +1,29 @@
package com.baeldung.apache.opennlp;

import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;

public class SentenceDetectionTest {

    @Test
    public void givenText_whenDetectSentences_thenGetSentences() throws Exception {
        String paragraph = "This is a statement. This is another statement. Now is an abstract word for time, "
            + "that is always flying. And my email address is google@gmail.com.";

        // The pre-trained model decides which punctuation actually ends a sentence.
        InputStream is = new FileInputStream("src/main/resources/models/en-sent.bin");
        SentenceModel model = new SentenceModel(is);

        SentenceDetectorME sdetector = new SentenceDetectorME(model);

        String[] sentences = sdetector.sentDetect(paragraph);
        assertThat(sentences).contains("This is a statement.",
            "This is another statement.",
            "Now is an abstract word for time, that is always flying.",
            "And my email address is google@gmail.com.");
    }
}
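If character offsets are needed instead of the sentence strings, SentenceDetectorME can return Spans. A minimal sketch, not part of this commit, using the same detector and paragraph (and an extra import of opennlp.tools.util.Span):

        // Hypothetical follow-up: sentence boundaries as character-offset Spans.
        Span[] sentenceSpans = sdetector.sentPosDetect(paragraph);
        // Each span can be applied back to the paragraph to recover the sentence text.
        assertThat(sentenceSpans[0].getCoveredText(paragraph).toString()).isEqualTo("This is a statement.");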
TokenizerTest.java
@@ -0,0 +1,36 @@
package com.baeldung.apache.opennlp;

import java.io.FileInputStream;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;

public class TokenizerTest {

    @Test
    public void givenString_whenTokenize_thenGetTokens() throws Exception {
        // TokenizerME uses the pre-trained maxent model and splits the trailing period into its own token.
        FileInputStream fileInputStream = new FileInputStream("src/main/resources/models/en-token.bin");
        TokenizerModel model = new TokenizerModel(fileInputStream);
        TokenizerME tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
    }

    @Test
    public void givenString_whenWhitespaceTokenizer_thenGetTokens() throws Exception {
        // WhitespaceTokenizer splits on whitespace only, so the period stays attached to "Resource.".
        WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
    }

    @Test
    public void givenString_whenSimpleTokenizer_thenGetTokens() throws Exception {
        // SimpleTokenizer splits on character classes, separating words, digits, and punctuation.
        SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
    }
}
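TokenizerME additionally exposes a probability for each token from its last tokenize() call. A small sketch, not part of this commit, assuming the tokenizer and tokens from the first test above:

        // Hypothetical follow-up: confidence of each token boundary decision.
        double[] tokenProbs = tokenizer.getTokenProbabilities();
        assertThat(tokenProbs).hasSameSizeAs(tokens);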