BAEL-1557 Intro to Apache OpenNLP
This commit is contained in:
		
							parent
							
								
									764ccb54a9
								
							
						
					
					
						commit
						db10f66e60
					
				
							
								
								
									
										32
									
								
								apache-opennlp/pom.xml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								apache-opennlp/pom.xml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,32 @@ | |||||||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the apache-opennlp example module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Shared parent of the example modules. -->
    <parent>
        <groupId>com.baeldung</groupId>
        <artifactId>parent-modules</artifactId>
        <version>1.0.0-SNAPSHOT</version>
    </parent>

    <!-- Coordinates of this module. -->
    <artifactId>apache-opennlp</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <dependencies>
        <!-- Library under demonstration. -->
        <dependency>
            <groupId>org.apache.opennlp</groupId>
            <artifactId>opennlp-tools</artifactId>
            <version>1.8.4</version>
        </dependency>

        <!-- Test-only dependencies. -->
        <dependency>
            <groupId>org.assertj</groupId>
            <artifactId>assertj-core</artifactId>
            <version>3.9.0</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
							
								
								
									
										100
									
								
								apache-opennlp/src/main/resources/models/DoccatSample.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										100
									
								
								apache-opennlp/src/main/resources/models/DoccatSample.txt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-chunker.bin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-chunker.bin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										301403
									
								
								apache-opennlp/src/main/resources/models/en-lemmatizer.dict
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										301403
									
								
								apache-opennlp/src/main/resources/models/en-lemmatizer.dict
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-ner-person.bin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-ner-person.bin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-pos-maxent.bin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-pos-maxent.bin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-sent.bin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-sent.bin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-token.bin
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								apache-opennlp/src/main/resources/models/en-token.bin
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @ -0,0 +1,36 @@ | |||||||
|  | package com.baeldung.apache.opennlp; | ||||||
|  | 
 | ||||||
|  | import java.io.FileInputStream; | ||||||
|  | import java.io.InputStream; | ||||||
|  | import opennlp.tools.chunker.ChunkerME; | ||||||
|  | import opennlp.tools.chunker.ChunkerModel; | ||||||
|  | import opennlp.tools.postag.POSModel; | ||||||
|  | import opennlp.tools.postag.POSTaggerME; | ||||||
|  | import opennlp.tools.tokenize.SimpleTokenizer; | ||||||
|  | import static org.assertj.core.api.Assertions.assertThat; | ||||||
|  | import org.junit.Test; | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * | ||||||
|  |  * @author Parth | ||||||
|  |  */ | ||||||
|  | public class ChunkerTest { | ||||||
|  | 
 | ||||||
|  |     @Test | ||||||
|  |     public void givenSentence_whenChunk_thenGetChunks() throws Exception { | ||||||
|  | 
 | ||||||
|  |         SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; | ||||||
|  |         String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion."); | ||||||
|  | 
 | ||||||
|  |         InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin"); | ||||||
|  |         POSModel posModel = new POSModel(inputStreamPOSTagger); | ||||||
|  |         POSTaggerME posTagger = new POSTaggerME(posModel); | ||||||
|  |         String tags[] = posTagger.tag(tokens); | ||||||
|  | 
 | ||||||
|  |         InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin"); | ||||||
|  |         ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker); | ||||||
|  |         ChunkerME chunker = new ChunkerME(chunkerModel); | ||||||
|  |         String[] chunks = chunker.chunk(tokens, tags); | ||||||
|  |         assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O"); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -0,0 +1,41 @@ | |||||||
|  | package com.baeldung.apache.opennlp; | ||||||
|  | 
 | ||||||
|  | import java.io.File; | ||||||
|  | import java.io.FileNotFoundException; | ||||||
|  | import java.io.IOException; | ||||||
|  | import java.util.Arrays; | ||||||
|  | import opennlp.tools.langdetect.Language; | ||||||
|  | import opennlp.tools.langdetect.LanguageDetector; | ||||||
|  | import opennlp.tools.langdetect.LanguageDetectorFactory; | ||||||
|  | import opennlp.tools.langdetect.LanguageDetectorME; | ||||||
|  | import opennlp.tools.langdetect.LanguageDetectorModel; | ||||||
|  | import opennlp.tools.langdetect.LanguageDetectorSampleStream; | ||||||
|  | import opennlp.tools.util.InputStreamFactory; | ||||||
|  | import opennlp.tools.util.MarkableFileInputStreamFactory; | ||||||
|  | import opennlp.tools.util.ObjectStream; | ||||||
|  | import opennlp.tools.util.PlainTextByLineStream; | ||||||
|  | import opennlp.tools.util.TrainingParameters; | ||||||
|  | import static org.assertj.core.api.Assertions.assertThat; | ||||||
|  | import org.junit.Test; | ||||||
|  | 
 | ||||||
|  | public class LanguageDetectorAndTrainingData { | ||||||
|  | 
 | ||||||
|  |     @Test | ||||||
|  |     public void test() throws FileNotFoundException, IOException { | ||||||
|  |         InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt")); | ||||||
|  |         ObjectStream lineStream = new PlainTextByLineStream(dataIn, "UTF-8"); | ||||||
|  |         LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream); | ||||||
|  |         TrainingParameters params = new TrainingParameters(); | ||||||
|  |         params.put(TrainingParameters.ITERATIONS_PARAM, 100); | ||||||
|  |         params.put(TrainingParameters.CUTOFF_PARAM, 5); | ||||||
|  |         params.put("DataIndexer", "TwoPass"); | ||||||
|  |         params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES"); | ||||||
|  | 
 | ||||||
|  |         LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory()); | ||||||
|  | 
 | ||||||
|  |         LanguageDetector ld = new LanguageDetectorME(model); | ||||||
|  |         Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno"); | ||||||
|  |         assertThat(Arrays.asList(languages).toString()).contains("pob (0.9999999950605625)", "ita (4.939427661577956E-9)", "spa (9.665954064665144E-15)",  | ||||||
|  |                 "fra (8.250349924885834E-25)"); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -0,0 +1,31 @@ | |||||||
|  | package com.baeldung.apache.opennlp; | ||||||
|  | 
 | ||||||
|  | import java.io.File; | ||||||
|  | import java.io.FileInputStream; | ||||||
|  | import java.io.InputStream; | ||||||
|  | import opennlp.tools.lemmatizer.DictionaryLemmatizer; | ||||||
|  | import opennlp.tools.postag.POSModel; | ||||||
|  | import opennlp.tools.postag.POSTaggerME; | ||||||
|  | import opennlp.tools.tokenize.SimpleTokenizer; | ||||||
|  | import static org.assertj.core.api.Assertions.assertThat; | ||||||
|  | import org.junit.Test; | ||||||
|  | 
 | ||||||
|  | public class LemmetizerTest { | ||||||
|  | 
 | ||||||
|  |     @Test | ||||||
|  |     public void givenSentence_whenLemmetize_thenGetLemmas() throws Exception { | ||||||
|  | 
 | ||||||
|  |         SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; | ||||||
|  |         String[] tokens = tokenizer.tokenize("John has a sister named Penny."); | ||||||
|  | 
 | ||||||
|  |         InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin"); | ||||||
|  |         POSModel posModel = new POSModel(inputStreamPOSTagger); | ||||||
|  |         POSTaggerME posTagger = new POSTaggerME(posModel); | ||||||
|  |         String tags[] = posTagger.tag(tokens); | ||||||
|  |         InputStream dictLemmatizer = new FileInputStream("src/main/resources/models/en-lemmatizer.dict"); | ||||||
|  |         DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer); | ||||||
|  |         String[] lemmas = lemmatizer.lemmatize(tokens, tags); | ||||||
|  | 
 | ||||||
|  |         assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O"); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -0,0 +1,40 @@ | |||||||
|  | package com.baeldung.apache.opennlp; | ||||||
|  | 
 | ||||||
|  | import java.io.FileInputStream; | ||||||
|  | import java.io.InputStream; | ||||||
|  | import java.util.ArrayList; | ||||||
|  | import java.util.Arrays; | ||||||
|  | import java.util.List; | ||||||
|  | import opennlp.tools.namefind.NameFinderME; | ||||||
|  | import opennlp.tools.namefind.TokenNameFinderModel; | ||||||
|  | import opennlp.tools.tokenize.SimpleTokenizer; | ||||||
|  | import opennlp.tools.util.Span; | ||||||
|  | import static org.assertj.core.api.Assertions.assertThat; | ||||||
|  | import org.junit.Test; | ||||||
|  | 
 | ||||||
|  | public class NamedEntityRecognitionTest { | ||||||
|  | 
 | ||||||
|  |     @Test | ||||||
|  |     public void givenTextWithPersonNames_whenNER_thenGetPersonNamesList() throws Exception { | ||||||
|  |          | ||||||
|  |         SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; | ||||||
|  |         String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny."); | ||||||
|  |          | ||||||
|  |         InputStream inputStreamNameFinder = new FileInputStream("src/main/resources/models/en-ner-person.bin"); | ||||||
|  |         TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder); | ||||||
|  |         NameFinderME nameFinderME = new NameFinderME(model); | ||||||
|  |         List<Span> spans = Arrays.asList(nameFinderME.find(tokens)); | ||||||
|  |         assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]"); | ||||||
|  |         List<String> names = new ArrayList<String>(); | ||||||
|  |         int k = 0; | ||||||
|  |         for (Span s : spans) { | ||||||
|  |             names.add(""); | ||||||
|  |             for (int index = s.getStart(); index < s.getEnd(); index++) { | ||||||
|  |                 names.set(k, names.get(k) + tokens[index]); | ||||||
|  |             } | ||||||
|  |             k++; | ||||||
|  |         } | ||||||
|  |         assertThat(names).contains("John","Leonard","Penny"); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | } | ||||||
| @ -0,0 +1,25 @@ | |||||||
|  | package com.baeldung.apache.opennlp; | ||||||
|  | 
 | ||||||
|  | import java.io.FileInputStream; | ||||||
|  | import java.io.InputStream; | ||||||
|  | import opennlp.tools.postag.POSModel; | ||||||
|  | import opennlp.tools.postag.POSTaggerME; | ||||||
|  | import opennlp.tools.tokenize.SimpleTokenizer; | ||||||
|  | import static org.assertj.core.api.Assertions.assertThat; | ||||||
|  | import org.junit.Test; | ||||||
|  | 
 | ||||||
|  | public class POSTaggerTest { | ||||||
|  | 
 | ||||||
|  |     @Test | ||||||
|  |     public void givenSentence_whenPOSTagging_thenGetTags() throws Exception { | ||||||
|  | 
 | ||||||
|  |         SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; | ||||||
|  |         String[] tokens = tokenizer.tokenize("John has a sister named Penny."); | ||||||
|  | 
 | ||||||
|  |         InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin"); | ||||||
|  |         POSModel posModel = new POSModel(inputStreamPOSTagger); | ||||||
|  |         POSTaggerME posTagger = new POSTaggerME(posModel); | ||||||
|  |         String tags[] = posTagger.tag(tokens); | ||||||
|  |         assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", "."); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -0,0 +1,29 @@ | |||||||
|  | package com.baeldung.apache.opennlp; | ||||||
|  | 
 | ||||||
|  | import java.io.FileInputStream; | ||||||
|  | import java.io.InputStream; | ||||||
|  | import opennlp.tools.sentdetect.SentenceDetectorME; | ||||||
|  | import opennlp.tools.sentdetect.SentenceModel; | ||||||
|  | import static org.assertj.core.api.Assertions.assertThat; | ||||||
|  | import org.junit.Test; | ||||||
|  | 
 | ||||||
|  | public class SentenceDetectionTest { | ||||||
|  | 
 | ||||||
|  |     @Test | ||||||
|  |     public void givenText_whenDetectSent_thenGetSentences() throws Exception { | ||||||
|  | 
 | ||||||
|  |         String paragraph = "This is a statement. This is another statement. Now is an abstract word for time, " | ||||||
|  |                 + "that is always flying. And my email address is google@gmail.com."; | ||||||
|  | 
 | ||||||
|  |         InputStream is = new FileInputStream("src/main/resources/models/en-sent.bin"); | ||||||
|  |         SentenceModel model = new SentenceModel(is); | ||||||
|  | 
 | ||||||
|  |         SentenceDetectorME sdetector = new SentenceDetectorME(model); | ||||||
|  | 
 | ||||||
|  |         String sentences[] = sdetector.sentDetect(paragraph); | ||||||
|  |         assertThat(sentences).contains("This is a statement.", | ||||||
|  |                 "This is another statement.", | ||||||
|  |                 "Now is an abstract word for time, that is always flying.", | ||||||
|  |                 "And my email address is google@gmail.com."); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @ -0,0 +1,36 @@ | |||||||
|  | package com.baeldung.apache.opennlp; | ||||||
|  | 
 | ||||||
|  | import java.io.FileInputStream; | ||||||
|  | import opennlp.tools.tokenize.SimpleTokenizer; | ||||||
|  | import opennlp.tools.tokenize.TokenizerME; | ||||||
|  | import opennlp.tools.tokenize.TokenizerModel; | ||||||
|  | import opennlp.tools.tokenize.WhitespaceTokenizer; | ||||||
|  | import static org.assertj.core.api.Assertions.assertThat; | ||||||
|  | import org.junit.Test; | ||||||
|  | 
 | ||||||
|  | public class TokenizerTest { | ||||||
|  | 
 | ||||||
|  |     @Test | ||||||
|  |     public void givenString_whenTokenize_thenGetTokens() throws Exception { | ||||||
|  |          FileInputStream fileInputStream = new FileInputStream("src/main/resources/models/en-token.bin"); | ||||||
|  |         TokenizerModel model = new TokenizerModel(fileInputStream); | ||||||
|  |         TokenizerME tokenizer = new TokenizerME(model); | ||||||
|  |         String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource."); | ||||||
|  |         assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", "."); | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     @Test | ||||||
|  |     public void givenString_whenWhitespaceTokenizer_thenGetTokens() throws Exception { | ||||||
|  |         WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE; | ||||||
|  |         String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource."); | ||||||
|  |         assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource."); | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     @Test | ||||||
|  |     public void givenString_whenSimpleTokenizer_thenGetTokens() throws Exception { | ||||||
|  |         SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE; | ||||||
|  |         String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource."); | ||||||
|  |         assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", "."); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | } | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user