lucene analyzers

2018-08-25 20:16:30 +03:00 · 2018-08-25 20:16:30 +03:00 · 1613d21656
commit 1613d21656
parent 3f1518b151
4 changed files with 183 additions and 4 deletions
--- a/lucene/pom.xml
+++ b/lucene/pom.xml
@ -23,6 +23,12 @@
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+        
    </dependencies>

    <properties>
@ -32,7 +38,7 @@
        <hibernate-jpa-2.1-api.version>1.0.0.Final</hibernate-jpa-2.1-api.version>
        <!-- delombok maven plugin -->
        <delombok-maven-plugin.version>1.16.10.0</delombok-maven-plugin.version>
-        <lucene.version>7.1.0</lucene.version>
+        <lucene.version>7.4.0</lucene.version>
    </properties>

 </project>
--- a/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java
+++ b/lucene/src/main/java/com/baeldung/lucene/InMemoryLuceneIndex.java
@ -4,7 +4,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

-import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedDocValuesField;
@ -27,9 +27,9 @@ import org.apache.lucene.util.BytesRef;
 public class InMemoryLuceneIndex {

    private Directory memoryIndex;
-    private StandardAnalyzer analyzer;
+    private Analyzer analyzer;

-    public InMemoryLuceneIndex(Directory memoryIndex, StandardAnalyzer analyzer) {
+    public InMemoryLuceneIndex(Directory memoryIndex, Analyzer analyzer) {
        super();
        this.memoryIndex = memoryIndex;
        this.analyzer = analyzer;
--- a/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java
+++ b/lucene/src/main/java/com/baeldung/lucene/MyCustomAnalyzer.java
@ -0,0 +1,26 @@
+package com.baeldung.lucene;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+public class MyCustomAnalyzer extends Analyzer{
+
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+        final StandardTokenizer src = new StandardTokenizer();
+        TokenStream result = new StandardFilter(src);
+        result = new LowerCaseFilter(result);
+        result = new StopFilter(result,  StandardAnalyzer.STOP_WORDS_SET);
+        result = new PorterStemFilter(result);
+        result = new CapitalizationFilter(result);
+        return new TokenStreamComponents(src, result);
+    }
+
+}
--- a/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java
+++ b/lucene/src/test/java/com/baeldung/lucene/LuceneAnalyzerIntegrationTest.java
@ -0,0 +1,147 @@
+package com.baeldung.lucene;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.collection.IsIterableContainingInOrder.contains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.core.SimpleAnalyzer;
+import org.apache.lucene.analysis.core.StopAnalyzer;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.junit.Test;
+
+public class LuceneAnalyzerIntegrationTest {
+
+    private static final String SAMPLE_TEXT = "This is baeldung.com Lucene Analyzers test";
+    private static final String FIELD_NAME = "sampleName";
+
+    @Test
+    public void whenUseStandardAnalyzer_thenAnalyzed() throws IOException {
+        List<String> result = analyze(SAMPLE_TEXT, new StandardAnalyzer());
+
+        assertThat(result, contains("baeldung.com", "lucene", "analyzers", "test"));
+    }
+
+    @Test
+    public void whenUseStopAnalyzer_thenAnalyzed() throws IOException {
+        List<String> result = analyze(SAMPLE_TEXT, new StopAnalyzer());
+
+        assertThat(result, contains("baeldung", "com", "lucene", "analyzers", "test"));
+    }
+
+    @Test
+    public void whenUseSimpleAnalyzer_thenAnalyzed() throws IOException {
+        List<String> result = analyze(SAMPLE_TEXT, new SimpleAnalyzer());
+
+        assertThat(result, contains("this", "is", "baeldung", "com", "lucene", "analyzers", "test"));
+    }
+
+    @Test
+    public void whenUseWhiteSpaceAnalyzer_thenAnalyzed() throws IOException {
+        List<String> result = analyze(SAMPLE_TEXT, new WhitespaceAnalyzer());
+
+        assertThat(result, contains("This", "is", "baeldung.com", "Lucene", "Analyzers", "test"));
+    }
+
+    @Test
+    public void whenUseKeywordAnalyzer_thenAnalyzed() throws IOException {
+        List<String> result = analyze(SAMPLE_TEXT, new KeywordAnalyzer());
+
+        assertThat(result, contains("This is baeldung.com Lucene Analyzers test"));
+    }
+
+    @Test
+    public void whenUseEnglishAnalyzer_thenAnalyzed() throws IOException {
+        List<String> result = analyze(SAMPLE_TEXT, new EnglishAnalyzer());
+
+        assertThat(result, contains("baeldung.com", "lucen", "analyz", "test"));
+    }
+
+    @Test
+    public void whenUseCustomAnalyzerBuilder_thenAnalyzed() throws IOException {
+        Analyzer analyzer = CustomAnalyzer.builder()
+            .withTokenizer("standard")
+            .addTokenFilter("lowercase")
+            .addTokenFilter("stop")
+            .addTokenFilter("porterstem")
+            .addTokenFilter("capitalization")
+            .build();
+        List<String> result = analyze(SAMPLE_TEXT, analyzer);
+
+        assertThat(result, contains("Baeldung.com", "Lucen", "Analyz", "Test"));
+    }
+
+    @Test
+    public void whenUseCustomAnalyzer_thenAnalyzed() throws IOException {
+        List<String> result = analyze(SAMPLE_TEXT, new MyCustomAnalyzer());
+
+        assertThat(result, contains("Baeldung.com", "Lucen", "Analyz", "Test"));
+    }
+
+    // ================= usage example
+    
+    @Test
+    public void givenTermQuery_whenUseCustomAnalyzer_thenCorrect() {
+        InMemoryLuceneIndex luceneIndex = new InMemoryLuceneIndex(new RAMDirectory(), new MyCustomAnalyzer());
+        luceneIndex.indexDocument("introduction", "introduction to lucene");
+        luceneIndex.indexDocument("analyzers", "guide to lucene analyzers");
+        Query query = new TermQuery(new Term("body", "Introduct"));
+
+        List<Document> documents = luceneIndex.searchIndex(query);
+        assertEquals(1, documents.size());
+    }
+    
+    @Test
+    public void givenTermQuery_whenUsePerFieldAnalyzerWrapper_thenCorrect() {
+        Map<String,Analyzer> analyzerMap = new HashMap<>();
+        analyzerMap.put("title", new MyCustomAnalyzer());
+        analyzerMap.put("body", new EnglishAnalyzer());
+
+        PerFieldAnalyzerWrapper wrapper =
+          new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerMap);
+        InMemoryLuceneIndex luceneIndex = new InMemoryLuceneIndex(new RAMDirectory(), wrapper);
+        luceneIndex.indexDocument("introduction", "introduction to lucene");
+        luceneIndex.indexDocument("analyzers", "guide to lucene analyzers");
+        
+        Query query = new TermQuery(new Term("body", "introduct"));
+        List<Document> documents = luceneIndex.searchIndex(query);
+        assertEquals(1, documents.size());
+        
+        query = new TermQuery(new Term("title", "Introduct"));
+
+        documents = luceneIndex.searchIndex(query);
+        assertEquals(1, documents.size());
+    }
+
+    // ===================================================================
+
+    public List<String> analyze(String text, Analyzer analyzer) throws IOException {
+        List<String> result = new ArrayList<String>();
+        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+        CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
+        tokenStream.reset();
+        while (tokenStream.incrementToken()) {
+            result.add(attr.toString());
+        }
+        return result;
+    }
+
+}