diff --git a/lucene/pom.xml b/lucene/pom.xml index a3960f6059..f427cfd8a7 100644 --- a/lucene/pom.xml +++ b/lucene/pom.xml @@ -23,6 +23,12 @@ lucene-queryparser ${lucene.version} + + org.apache.lucene + lucene-analyzers-common + ${lucene.version} + + @@ -32,7 +38,7 @@ 1.0.0.Final 1.16.10.0 - 7.1.0 + 7.4.0 \ No newline at end of file diff --git a/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java b/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java index 97b1ec7b5d..8a31d3cb5b 100644 --- a/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java +++ b/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java @@ -4,7 +4,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedDocValuesField; @@ -27,9 +27,9 @@ import org.apache.lucene.util.BytesRef; public class InMemoryLuceneIndex { private Directory memoryIndex; - private StandardAnalyzer analyzer; + private Analyzer analyzer; - public InMemoryLuceneIndex(Directory memoryIndex, StandardAnalyzer analyzer) { + public InMemoryLuceneIndex(Directory memoryIndex, Analyzer analyzer) { super(); this.memoryIndex = memoryIndex; this.analyzer = analyzer; diff --git a/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java b/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java new file mode 100644 index 0000000000..609e2d09d3 --- /dev/null +++ b/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java @@ -0,0 +1,26 @@ +package com.baeldung.lucene; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +public class MyCustomAnalyzer extends Analyzer{ + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + final StandardTokenizer src = new StandardTokenizer(); + TokenStream result = new StandardFilter(src); + result = new LowerCaseFilter(result); + result = new StopFilter(result, StandardAnalyzer.STOP_WORDS_SET); + result = new PorterStemFilter(result); + result = new CapitalizationFilter(result); + return new TokenStreamComponents(src, result); + } + +} diff --git a/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java b/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java new file mode 100644 index 0000000000..28a87bba8c --- /dev/null +++ b/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java @@ -0,0 +1,147 @@ +package com.baeldung.lucene; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.collection.IsIterableContainingInOrder.contains; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.analysis.core.SimpleAnalyzer; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +public class LuceneAnalyzerIntegrationTest { + + private static final String SAMPLE_TEXT = "This is baeldung.com Lucene Analyzers test"; + private static final String FIELD_NAME = "sampleName"; + + @Test + public void whenUseStandardAnalyzer_thenAnalyzed() throws IOException { + List result = analyze(SAMPLE_TEXT, new StandardAnalyzer()); + + assertThat(result, contains("baeldung.com", "lucene", "analyzers", "test")); + } + + @Test + public void whenUseStopAnalyzer_thenAnalyzed() throws IOException { + List result = analyze(SAMPLE_TEXT, new StopAnalyzer()); + + assertThat(result, contains("baeldung", "com", "lucene", "analyzers", "test")); + } + + @Test + public void whenUseSimpleAnalyzer_thenAnalyzed() throws IOException { + List result = analyze(SAMPLE_TEXT, new SimpleAnalyzer()); + + assertThat(result, contains("this", "is", "baeldung", "com", "lucene", "analyzers", "test")); + } + + @Test + public void whenUseWhiteSpaceAnalyzer_thenAnalyzed() throws IOException { + List result = analyze(SAMPLE_TEXT, new WhitespaceAnalyzer()); + + assertThat(result, contains("This", "is", "baeldung.com", "Lucene", "Analyzers", "test")); + } + + @Test + public void whenUseKeywordAnalyzer_thenAnalyzed() throws IOException { + List result = analyze(SAMPLE_TEXT, new KeywordAnalyzer()); + + assertThat(result, contains("This is baeldung.com Lucene Analyzers test")); + } + + @Test + public void whenUseEnglishAnalyzer_thenAnalyzed() throws IOException { + List result = analyze(SAMPLE_TEXT, new EnglishAnalyzer()); + + assertThat(result, contains("baeldung.com", "lucen", "analyz", "test")); + } + + @Test + public void whenUseCustomAnalyzerBuilder_thenAnalyzed() throws IOException { + Analyzer analyzer = CustomAnalyzer.builder() + .withTokenizer("standard") + .addTokenFilter("lowercase") + .addTokenFilter("stop") + .addTokenFilter("porterstem") + .addTokenFilter("capitalization") + .build(); + List result = analyze(SAMPLE_TEXT, analyzer); + + assertThat(result, contains("Baeldung.com", "Lucen", "Analyz", "Test")); + } + + @Test + public void whenUseCustomAnalyzer_thenAnalyzed() throws IOException { + List result = analyze(SAMPLE_TEXT, new MyCustomAnalyzer()); + + assertThat(result, contains("Baeldung.com", "Lucen", "Analyz", "Test")); + } + + // ================= usage example + + @Test + public void givenTermQuery_whenUseCustomAnalyzer_thenCorrect() { + InMemoryLuceneIndex luceneIndex = new InMemoryLuceneIndex(new RAMDirectory(), new MyCustomAnalyzer()); + luceneIndex.indexDocument("introduction", "introduction to lucene"); + luceneIndex.indexDocument("analyzers", "guide to lucene analyzers"); + Query query = new TermQuery(new Term("body", "Introduct")); + + List documents = luceneIndex.searchIndex(query); + assertEquals(1, documents.size()); + } + + @Test + public void givenTermQuery_whenUsePerFieldAnalyzerWrapper_thenCorrect() { + Map analyzerMap = new HashMap<>(); + analyzerMap.put("title", new MyCustomAnalyzer()); + analyzerMap.put("body", new EnglishAnalyzer()); + + PerFieldAnalyzerWrapper wrapper = + new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerMap); + InMemoryLuceneIndex luceneIndex = new InMemoryLuceneIndex(new RAMDirectory(), wrapper); + luceneIndex.indexDocument("introduction", "introduction to lucene"); + luceneIndex.indexDocument("analyzers", "guide to lucene analyzers"); + + Query query = new TermQuery(new Term("body", "introduct")); + List documents = luceneIndex.searchIndex(query); + assertEquals(1, documents.size()); + + query = new TermQuery(new Term("title", "Introduct")); + + documents = luceneIndex.searchIndex(query); + assertEquals(1, documents.size()); + } + + // =================================================================== + + public List analyze(String text, Analyzer analyzer) throws IOException { + List result = new ArrayList(); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text); + CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + result.add(attr.toString()); + } + return result; + } + +}