diff --git a/lucene/pom.xml b/lucene/pom.xml
index a3960f6059..f427cfd8a7 100644
--- a/lucene/pom.xml
+++ b/lucene/pom.xml
@@ -23,6 +23,12 @@
             <artifactId>lucene-queryparser</artifactId>
             <version>${lucene.version}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+
     </dependencies>
 
     <properties>
@@ -32,7 +38,7 @@
         1.0.0.Final
         1.16.10.0
-        <lucene.version>7.1.0</lucene.version>
+        <lucene.version>7.4.0</lucene.version>
\ No newline at end of file
diff --git a/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java b/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java
index 97b1ec7b5d..8a31d3cb5b 100644
--- a/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java
+++ b/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java
@@ -4,7 +4,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
@@ -27,9 +27,9 @@ import org.apache.lucene.util.BytesRef;
public class InMemoryLuceneIndex {
private Directory memoryIndex;
- private StandardAnalyzer analyzer;
+ private Analyzer analyzer;
- public InMemoryLuceneIndex(Directory memoryIndex, StandardAnalyzer analyzer) {
+ public InMemoryLuceneIndex(Directory memoryIndex, Analyzer analyzer) {
super();
this.memoryIndex = memoryIndex;
this.analyzer = analyzer;
diff --git a/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java b/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java
new file mode 100644
index 0000000000..609e2d09d3
--- /dev/null
+++ b/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java
@@ -0,0 +1,28 @@
+package com.baeldung.lucene;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
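+// Analyzer equivalent to the CustomAnalyzer.builder() chain in the tests:
+// standard tokenization, lower-casing, stop-word removal, Porter stemming, capitalization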
+public class MyCustomAnalyzer extends Analyzer {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final StandardTokenizer src = new StandardTokenizer();
+ TokenStream result = new StandardFilter(src); // standard post-tokenization cleanup (a pass-through in Lucene 7.x)
+ result = new LowerCaseFilter(result); // lower-case all tokens
+ result = new StopFilter(result, StandardAnalyzer.STOP_WORDS_SET); // drop common English stop words
+ result = new PorterStemFilter(result); // reduce tokens to their Porter stems
+ result = new CapitalizationFilter(result); // capitalize the first letter of each token
+ return new TokenStreamComponents(src, result);
+ }
+
+}
diff --git a/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java b/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java
new file mode 100644
index 0000000000..28a87bba8c
--- /dev/null
+++ b/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java
@@ -0,0 +1,155 @@
+package com.baeldung.lucene;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.collection.IsIterableContainingInOrder.contains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.core.SimpleAnalyzer;
+import org.apache.lucene.analysis.core.StopAnalyzer;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.junit.Test;
+
+public class LuceneAnalyzerIntegrationTest {
+
+ private static final String SAMPLE_TEXT = "This is baeldung.com Lucene Analyzers test";
+ private static final String FIELD_NAME = "sampleName";
+
+ @Test
+ public void whenUseStandardAnalyzer_thenAnalyzed() throws IOException {
+ List<String> result = analyze(SAMPLE_TEXT, new StandardAnalyzer());
+
+ assertThat(result, contains("baeldung.com", "lucene", "analyzers", "test"));
+ }
+
+ @Test
+ public void whenUseStopAnalyzer_thenAnalyzed() throws IOException {
+ List<String> result = analyze(SAMPLE_TEXT, new StopAnalyzer());
+
+ assertThat(result, contains("baeldung", "com", "lucene", "analyzers", "test"));
+ }
+
+ @Test
+ public void whenUseSimpleAnalyzer_thenAnalyzed() throws IOException {
+ List<String> result = analyze(SAMPLE_TEXT, new SimpleAnalyzer());
+
+ assertThat(result, contains("this", "is", "baeldung", "com", "lucene", "analyzers", "test"));
+ }
+
+ @Test
+ public void whenUseWhiteSpaceAnalyzer_thenAnalyzed() throws IOException {
+ List<String> result = analyze(SAMPLE_TEXT, new WhitespaceAnalyzer());
+
+ assertThat(result, contains("This", "is", "baeldung.com", "Lucene", "Analyzers", "test"));
+ }
+
+ @Test
+ public void whenUseKeywordAnalyzer_thenAnalyzed() throws IOException {
+ List<String> result = analyze(SAMPLE_TEXT, new KeywordAnalyzer());
+
+ assertThat(result, contains("This is baeldung.com Lucene Analyzers test"));
+ }
+
+ @Test
+ public void whenUseEnglishAnalyzer_thenAnalyzed() throws IOException {
+ List<String> result = analyze(SAMPLE_TEXT, new EnglishAnalyzer());
+
+ assertThat(result, contains("baeldung.com", "lucen", "analyz", "test"));
+ }
+
+ @Test
+ public void whenUseCustomAnalyzerBuilder_thenAnalyzed() throws IOException {
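+ // factory names are resolved via SPI, e.g. "standard" -> StandardTokenizerFactory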
+ Analyzer analyzer = CustomAnalyzer.builder()
+ .withTokenizer("standard")
+ .addTokenFilter("lowercase")
+ .addTokenFilter("stop")
+ .addTokenFilter("porterstem")
+ .addTokenFilter("capitalization")
+ .build();
+ List<String> result = analyze(SAMPLE_TEXT, analyzer);
+
+ assertThat(result, contains("Baeldung.com", "Lucen", "Analyz", "Test"));
+ }
+
+ @Test
+ public void whenUseCustomAnalyzer_thenAnalyzed() throws IOException {
+ List<String> result = analyze(SAMPLE_TEXT, new MyCustomAnalyzer());
+
+ assertThat(result, contains("Baeldung.com", "Lucen", "Analyz", "Test"));
+ }
+
+ // ================= usage examples
+
+ @Test
+ public void givenTermQuery_whenUseCustomAnalyzer_thenCorrect() {
+ InMemoryLuceneIndex luceneIndex = new InMemoryLuceneIndex(new RAMDirectory(), new MyCustomAnalyzer());
+ luceneIndex.indexDocument("introduction", "introduction to lucene");
+ luceneIndex.indexDocument("analyzers", "guide to lucene analyzers");
+ Query query = new TermQuery(new Term("body", "Introduct"));
+
+ List<Document> documents = luceneIndex.searchIndex(query);
+ assertEquals(1, documents.size());
+ }
+
+ @Test
+ public void givenTermQuery_whenUsePerFieldAnalyzerWrapper_thenCorrect() {
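+ // "title" is analyzed with MyCustomAnalyzer and "body" with EnglishAnalyzer;
+ // fields not in the map fall back to the wrapped StandardAnalyzer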
+ Map<String, Analyzer> analyzerMap = new HashMap<>();
+ analyzerMap.put("title", new MyCustomAnalyzer());
+ analyzerMap.put("body", new EnglishAnalyzer());
+
+ PerFieldAnalyzerWrapper wrapper =
+ new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerMap);
+ InMemoryLuceneIndex luceneIndex = new InMemoryLuceneIndex(new RAMDirectory(), wrapper);
+ luceneIndex.indexDocument("introduction", "introduction to lucene");
+ luceneIndex.indexDocument("analyzers", "guide to lucene analyzers");
+
+ Query query = new TermQuery(new Term("body", "introduct"));
+ List<Document> documents = luceneIndex.searchIndex(query);
+ assertEquals(1, documents.size());
+
+ query = new TermQuery(new Term("title", "Introduct"));
+
+ documents = luceneIndex.searchIndex(query);
+ assertEquals(1, documents.size());
+ }
+
+ // ===================================================================
+
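+ // TokenStream contract: reset() before the first incrementToken(),
+ // end() after the last token; close() is handled by try-with-resources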
+ public List<String> analyze(String text, Analyzer analyzer) throws IOException {
+ List<String> result = new ArrayList<>();
+ try (TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text)) {
+ CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
+ tokenStream.reset();
+ while (tokenStream.incrementToken()) {
+ result.add(attr.toString());
+ }
+ tokenStream.end(); // signal end of stream before it is closed
+ }
+ return result;
+ }
+
+}