Analysis: Add `char_filter` on top of `tokenizer`, `filter`, and `analyzer`. Add an `html_strip` char filter, closes #315.

2010-08-12 18:16:30 +03:00 · 2010-08-12 18:16:30 +03:00 · 98bc8285ea
parent e29925684a
commit 98bc8285ea
13 changed files with 1695 additions and 10 deletions
--- a/.idea/dictionaries/kimchy.xml
+++ b/.idea/dictionaries/kimchy.xml
@ -19,6 +19,7 @@
      <w>calc</w>
      <w>camelcase</w>
      <w>canonicalhost</w>
+      <w>charfilter</w>
      <w>checksum</w>
      <w>chunking</w>
      <w>closeable</w>
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/common/lucene/analysis/HTMLStripCharFilter.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/common/lucene/analysis/HTMLStripCharFilter.java
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AbstractCharFilterFactory.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AbstractCharFilterFactory.java
@ -0,0 +1,43 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.AbstractIndexComponent;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * Base class for {@link CharFilterFactory} implementations. It passes the index and the
+ * index settings on to {@link AbstractIndexComponent} and remembers the name the char
+ * filter was created with; {@code create} is left to subclasses.
+ *
+ * @author kimchy (shay.banon)
+ */
+public abstract class AbstractCharFilterFactory extends AbstractIndexComponent implements CharFilterFactory {
+
+    /** Name handed to the constructor; returned unchanged by {@link #name()}. */
+    private final String name;
+
+    /**
+     * @param index         the index, passed through to the super constructor
+     * @param indexSettings the index settings, passed through to the super constructor
+     * @param name          the name reported by {@link #name()}
+     */
+    public AbstractCharFilterFactory(Index index, @IndexSettings Settings indexSettings, String name) {
+        super(index, indexSettings);
+        this.name = name;
+    }
+
+    /**
+     * @return the name supplied at construction time
+     */
+    @Override public String name() {
+        return name;
+    }
+}
+
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
@ -37,6 +37,34 @@ public class AnalysisModule extends AbstractModule {

    public static class AnalysisBinderProcessor {

+        /**
+         * Extension hook for registering char filters. The module's {@code configure()} calls it
+         * once per processor, after the char filters declared in the settings have been bound.
+         * This base implementation does nothing.
+         *
+         * @param charFiltersBindings gives access to the char filter binder and settings groups
+         */
+        public void processCharFilters(CharFiltersBindings charFiltersBindings) {
+
+        }
+
+        /**
+         * Pairs the char filter map binder with the char filter settings groups, so that a
+         * processor can add bindings of its own without replacing a char filter that already
+         * has a settings group under the same name.
+         */
+        public static class CharFiltersBindings {
+            private final MapBinder<String, CharFilterFactoryFactory> binder;
+            private final Map<String, Settings> groupSettings;
+
+            /**
+             * @param binder        map binder that receives the char filter bindings
+             * @param groupSettings char filter settings groups, keyed by char filter name
+             */
+            public CharFiltersBindings(MapBinder<String, CharFilterFactoryFactory> binder, Map<String, Settings> groupSettings) {
+                this.binder = binder;
+                this.groupSettings = groupSettings;
+            }
+
+            /**
+             * @return the map binder passed to the constructor
+             */
+            public MapBinder<String, CharFilterFactoryFactory> binder() {
+                return this.binder;
+            }
+
+            /**
+             * @return the settings groups passed to the constructor
+             */
+            public Map<String, Settings> groupSettings() {
+                return this.groupSettings;
+            }
+
+            /**
+             * Binds {@code charFilterFactory} under {@code name} as a singleton, unless the
+             * settings groups already contain an entry for that name.
+             *
+             * @param name              the name to bind the char filter under
+             * @param charFilterFactory the factory class to bind
+             */
+            public void processCharFilter(String name, Class<? extends CharFilterFactory> charFilterFactory) {
+                if (groupSettings.containsKey(name)) {
+                    // a settings group exists under this name; do not add another binding
+                    return;
+                }
+                binder.addBinding(name)
+                        .toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, charFilterFactory))
+                        .in(Scopes.SINGLETON);
+            }
+        }
+
        public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {

        }
@ -159,6 +187,27 @@ public class AnalysisModule extends AbstractModule {
    }

    @Override protected void configure() {
+        MapBinder<String, CharFilterFactoryFactory> charFilterBinder
+                = MapBinder.newMapBinder(binder(), String.class, CharFilterFactoryFactory.class);
+
+        Map<String, Settings> charFiltersSettings = settings.getGroups("index.analysis.char_filter");
+        for (Map.Entry<String, Settings> entry : charFiltersSettings.entrySet()) {
+            String charFilterName = entry.getKey();
+            Settings charFilterSettings = entry.getValue();
+
+            Class<? extends CharFilterFactory> type = charFilterSettings.getAsClass("type", null, "org.elasticsearch.index.analysis.", "CharFilterFactory");
+            if (type == null) {
+                throw new IllegalArgumentException("Char Filter [" + charFilterName + "] must have a type associated with it");
+            }
+            charFilterBinder.addBinding(charFilterName).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, type)).in(Scopes.SINGLETON);
+        }
+
+        AnalysisBinderProcessor.CharFiltersBindings charFiltersBindings = new AnalysisBinderProcessor.CharFiltersBindings(charFilterBinder, charFiltersSettings);
+        for (AnalysisBinderProcessor processor : processors) {
+            processor.processCharFilters(charFiltersBindings);
+        }
+
+
        MapBinder<String, TokenFilterFactoryFactory> tokenFilterBinder
                = MapBinder.newMapBinder(binder(), String.class, TokenFilterFactoryFactory.class);

@ -230,6 +279,11 @@ public class AnalysisModule extends AbstractModule {

    private static class DefaultProcessor extends AnalysisBinderProcessor {

+        /**
+         * Registers the HTML stripping char filter under both of its names,
+         * {@code html_strip} and {@code htmlStrip}.
+         */
+        @Override public void processCharFilters(CharFiltersBindings charFiltersBindings) {
+            Class<? extends CharFilterFactory> htmlStrip = HtmlStripCharFilterFactory.class;
+            charFiltersBindings.processCharFilter("html_strip", htmlStrip);
+            charFiltersBindings.processCharFilter("htmlStrip", htmlStrip);
+        }
+
        @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
            tokenFiltersBindings.processTokenFilter("stop", StopTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("asciifolding", ASCIIFoldingTokenFilterFactory.class);
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java
@ -42,15 +42,18 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable

    private final ImmutableMap<String, TokenizerFactory> tokenizers;

+    private final ImmutableMap<String, CharFilterFactory> charFilters;
+
    private final ImmutableMap<String, TokenFilterFactory> tokenFilters;

    public AnalysisService(Index index) {
-        this(index, ImmutableSettings.Builder.EMPTY_SETTINGS, null, null, null);
+        this(index, ImmutableSettings.Builder.EMPTY_SETTINGS, null, null, null, null);
    }

    @Inject public AnalysisService(Index index, @IndexSettings Settings indexSettings,
                                   @Nullable Map<String, AnalyzerProviderFactory> analyzerFactoryFactories,
                                   @Nullable Map<String, TokenizerFactoryFactory> tokenizerFactoryFactories,
+                                   @Nullable Map<String, CharFilterFactoryFactory> charFilterFactoryFactories,
                                   @Nullable Map<String, TokenFilterFactoryFactory> tokenFilterFactoryFactories) {
        super(index, indexSettings);

@ -105,6 +108,24 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
        }
        this.tokenizers = ImmutableMap.copyOf(tokenizers);

+        // Build one CharFilterFactory for every bound CharFilterFactoryFactory. Each factory gets the
+        // settings group stored under its own name, or empty settings when no such group exists.
+        Map<String, CharFilterFactory> charFilters = newHashMap();
+        if (charFilterFactoryFactories != null) {
+            Map<String, Settings> charFiltersSettings = indexSettings.getGroups("index.analysis.char_filter");
+            for (Map.Entry<String, CharFilterFactoryFactory> entry : charFilterFactoryFactories.entrySet()) {
+                String charFilterName = entry.getKey();
+                CharFilterFactoryFactory charFilterFactoryFactory = entry.getValue();
+
+                Settings charFilterSettings = charFiltersSettings.get(charFilterName);
+                if (charFilterSettings == null) {
+                    charFilterSettings = ImmutableSettings.Builder.EMPTY_SETTINGS;
+                }
+
+                CharFilterFactory charFilterFactory = charFilterFactoryFactory.create(charFilterName, charFilterSettings);
+                charFilters.put(charFilterName, charFilterFactory);
+            }
+        }
+        this.charFilters = ImmutableMap.copyOf(charFilters);
+
        Map<String, TokenFilterFactory> tokenFilters = newHashMap();
        if (tokenFilterFactoryFactories != null) {
            Map<String, Settings> tokenFiltersSettings = indexSettings.getGroups("index.analysis.filter");
@ -152,6 +173,10 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
        return tokenizers.get(name);
    }

+    /**
+     * Looks up a char filter factory by name.
+     *
+     * @param name the char filter name
+     * @return the factory held under {@code name}, or {@code null} when there is none
+     */
+    public CharFilterFactory charFilter(String name) {
+        return this.charFilters.get(name);
+    }
+
    public TokenFilterFactory tokenFilter(String name) {
        return tokenFilters.get(name);
    }
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactory.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactory.java
@ -0,0 +1,33 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.CharStream;
+import org.elasticsearch.index.IndexComponent;
+
+/**
+ * A named factory that wraps a {@link CharStream} with a char filter. {@link CustomAnalyzer}
+ * chains the configured factories over the input before handing it to the tokenizer.
+ *
+ * @author kimchy (shay.banon)
+ */
+public interface CharFilterFactory extends IndexComponent {
+
+    /**
+     * @return the name of this char filter
+     */
+    String name();
+
+    /**
+     * Wraps the given character stream.
+     *
+     * @param charStream the stream to read characters from
+     * @return the stream to read from in place of {@code charStream}
+     */
+    CharStream create(CharStream charStream);
+}
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactoryFactory.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactoryFactory.java
@ -0,0 +1,30 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * Creates {@link CharFilterFactory} instances from a name and a settings object. The analysis
+ * module binds one of these per char filter name.
+ *
+ * @author kimchy (shay.banon)
+ */
+public interface CharFilterFactoryFactory {
+
+    /**
+     * @param name     the char filter name
+     * @param settings the settings for that char filter
+     * @return the char filter factory created for {@code name}
+     */
+    CharFilterFactory create(String name, Settings settings);
+}
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java
@ -19,9 +19,7 @@

 package org.elasticsearch.index.analysis;

-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.*;

 import java.io.IOException;
 import java.io.Reader;
@ -33,12 +31,15 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal

    private final TokenizerFactory tokenizerFactory;

+    private final CharFilterFactory[] charFilters;
+
    private final TokenFilterFactory[] tokenFilters;

    private int positionIncrementGap = 0;

-    public CustomAnalyzer(TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilters) {
+    public CustomAnalyzer(TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters, TokenFilterFactory[] tokenFilters) {
        this.tokenizerFactory = tokenizerFactory;
+        this.charFilters = charFilters;
        this.tokenFilters = tokenFilters;
    }

@ -54,6 +55,10 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
        return tokenFilters;
    }

+    /**
+     * @return the char filter factories given to the constructor; the array itself is returned,
+     *         not a copy
+     */
+    public CharFilterFactory[] charFilters() {
+        return this.charFilters;
+    }
+
    @Override public int getPositionIncrementGap(String fieldName) {
        return this.positionIncrementGap;
    }
@ -65,10 +70,10 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
    @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
        Holder holder = (Holder) getPreviousTokenStream();
        if (holder == null) {
-            holder = buildHolder(reader);
+            holder = buildHolder(charFilterIfNeeded(reader));
            setPreviousTokenStream(holder);
        } else {
-            holder.tokenizer.reset(reader);
+            holder.tokenizer.reset(charFilterIfNeeded(reader));
        }
        return holder.tokenStream;
    }
@ -82,7 +87,18 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
        return new Holder(tokenizer, tokenStream);
    }

-    private static class Holder {
+    /**
+     * Runs the reader through the configured char filters, in array order.
+     *
+     * @param reader the raw input
+     * @return {@code reader} itself when no char filters are configured, otherwise the stream
+     *         produced by the last char filter in the chain
+     */
+    private Reader charFilterIfNeeded(Reader reader) {
+        if (charFilters == null || charFilters.length == 0) {
+            return reader;
+        }
+        CharStream filtered = CharReader.get(reader);
+        for (int i = 0; i < charFilters.length; i++) {
+            filtered = charFilters[i].create(filtered);
+        }
+        return filtered;
+    }
+
+    static class Holder {
        final Tokenizer tokenizer;
        final TokenStream tokenStream;

--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java
@ -35,18 +35,21 @@ import static org.elasticsearch.common.collect.Lists.*;
 * A custom analyzer that is built out of a single {@link org.apache.lucene.analysis.Tokenizer} and a list
 * of {@link org.apache.lucene.analysis.TokenFilter}s.
 *
- * @author kimchy (Shay Banon)
+ * @author kimchy (shay.banon)
 */
 public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {

    private final TokenizerFactory tokenizerFactory;

+    private final CharFilterFactory[] charFilterFactories;
+
    private final TokenFilterFactory[] tokenFilterFactories;

    private final CustomAnalyzer customAnalyzer;

    @Inject public CustomAnalyzerProvider(Index index,
                                          Map<String, TokenizerFactoryFactory> tokenizerFactories,
+                                          Map<String, CharFilterFactoryFactory> charFilterFactories,
                                          Map<String, TokenFilterFactoryFactory> tokenFilterFactories,
                                          @IndexSettings Settings indexSettings,
                                          @Assisted String name, @Assisted Settings settings) {
@ -65,6 +68,21 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
        }
        tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, tokenizerSettings);

+        // Resolve every name listed under "char_filter", in order, to a char filter factory.
+        List<CharFilterFactory> charFilters = newArrayList();
+        String[] charFilterNames = settings.getAsArray("char_filter");
+        // Loop-invariant: read the char filter settings groups once instead of once per name.
+        Map<String, Settings> charFilterGroups = indexSettings.getGroups("index.analysis.char_filter");
+        for (String charFilterName : charFilterNames) {
+            CharFilterFactoryFactory charFilterFactoryFactory = charFilterFactories.get(charFilterName);
+            if (charFilterFactoryFactory == null) {
+                throw new IllegalArgumentException("Custom Analyzer [" + name + "] failed to find char filter under name [" + charFilterName + "]");
+            }
+            Settings charFilterSettings = charFilterGroups.get(charFilterName);
+            if (charFilterSettings == null) {
+                // no settings group under this name; create the factory with empty settings
+                charFilterSettings = ImmutableSettings.Builder.EMPTY_SETTINGS;
+            }
+            charFilters.add(charFilterFactoryFactory.create(charFilterName, charFilterSettings));
+        }
+        this.charFilterFactories = charFilters.toArray(new CharFilterFactory[charFilters.size()]);
+
        List<TokenFilterFactory> tokenFilters = newArrayList();
        String[] tokenFilterNames = settings.getAsArray("filter");
        for (String tokenFilterName : tokenFilterNames) {
@ -80,7 +98,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
        }
        this.tokenFilterFactories = tokenFilters.toArray(new TokenFilterFactory[tokenFilters.size()]);

-        this.customAnalyzer = new CustomAnalyzer(this.tokenizerFactory, this.tokenFilterFactories);
+        this.customAnalyzer = new CustomAnalyzer(this.tokenizerFactory, this.charFilterFactories, this.tokenFilterFactories);
    }

    @Override public CustomAnalyzer get() {
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/HtmlStripCharFilterFactory.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/HtmlStripCharFilterFactory.java
@ -0,0 +1,62 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.CharStream;
+import org.elasticsearch.common.collect.ImmutableSet;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * Char filter factory that wraps a stream in an {@link HTMLStripCharFilter}.
+ *
+ * <p>Settings read from the char filter's settings:
+ * <ul>
+ * <li>{@code read_ahead} - integer, defaults to {@link HTMLStripCharFilter#DEFAULT_READ_AHEAD}</li>
+ * <li>{@code escaped_tags} - array of tag names handed to the filter as its escaped tags
+ * (NOTE(review): presumably tags the filter leaves in place - confirm against
+ * {@link HTMLStripCharFilter})</li>
+ * </ul>
+ *
+ * @author kimchy (shay.banon)
+ */
+public class HtmlStripCharFilterFactory extends AbstractCharFilterFactory {
+
+    /** Configured escaped tags, or {@code null} when none were configured. */
+    private final ImmutableSet<String> escapedTags;
+
+    private final int readAheadLimit;
+
+    /**
+     * @param index         the index this factory belongs to
+     * @param indexSettings the index settings
+     * @param name          the char filter name
+     * @param settings      the char filter settings ({@code read_ahead}, {@code escaped_tags})
+     */
+    @Inject public HtmlStripCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name);
+        this.readAheadLimit = settings.getAsInt("read_ahead", HTMLStripCharFilter.DEFAULT_READ_AHEAD);
+        String[] configuredTags = settings.getAsArray("escaped_tags");
+        // "no escaped tags" is stored as null rather than as an empty set
+        this.escapedTags = configuredTags.length > 0 ? ImmutableSet.copyOf(configuredTags) : null;
+    }
+
+    /**
+     * @return the configured escaped tags, or {@code null} when none were configured
+     */
+    public ImmutableSet<String> escapedTags() {
+        return escapedTags;
+    }
+
+    /**
+     * @return the configured read ahead limit
+     */
+    public int readAheadLimit() {
+        return readAheadLimit;
+    }
+
+    /**
+     * Wraps {@code charStream} in a new {@link HTMLStripCharFilter} built with this factory's
+     * escaped tags and read ahead limit.
+     *
+     * @param charStream the stream to filter
+     * @return the filtering stream
+     */
+    @Override public CharStream create(CharStream charStream) {
+        return new HTMLStripCharFilter(charStream, escapedTags, readAheadLimit);
+    }
+}
--- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java
+++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java
@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.Analyzer;
 import org.elasticsearch.common.inject.Guice;
 import org.elasticsearch.common.inject.Injector;
+import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNameModule;
@ -66,5 +67,15 @@ public class AnalysisModuleTests {
        StopTokenFilterFactory stop1 = (StopTokenFilterFactory) custom1.tokenFilters()[0];
        assertThat(stop1.stopWords().size(), equalTo(1));
        assertThat(stop1.stopWords(), hasItem("test-stop"));
+
+        analyzer = analysisService.analyzer("custom2").analyzer();
+        assertThat(analyzer, instanceOf(CustomAnalyzer.class));
+        CustomAnalyzer custom2 = (CustomAnalyzer) analyzer;
+
+        // custom2 lists [html_strip, my_html]: the built-in filter first, then the configured one
+        assertThat(custom2.charFilters().length, equalTo(2));
+
+        HtmlStripCharFilterFactory html = (HtmlStripCharFilterFactory) custom2.charFilters()[0];
+        assertThat(html.readAheadLimit(), equalTo(HTMLStripCharFilter.DEFAULT_READ_AHEAD));
+
+        html = (HtmlStripCharFilterFactory) custom2.charFilters()[1];
+        assertThat(html.readAheadLimit(), equalTo(1024));
+        // my_html also configures escaped_tags; check that they reach the factory
+        assertThat(html.escapedTags().size(), equalTo(2));
+        assertThat(html.escapedTags(), hasItem("xxx"));
+        assertThat(html.escapedTags(), hasItem("yyy"));
    }
 }
--- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json
+++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json
@ -6,6 +6,13 @@
                    "type" : "standard"
                }
            },
+            "char_filter" : {
+                "my_html" : {
+                    "type" : "html_strip",
+                    "escaped_tags" : ["xxx", "yyy"],
+                    "read_ahead" : 1024
+                }
+            },
            "filter" : {
                "stop" : {
                    "type" : "stop",
@ -24,6 +31,10 @@
                "custom1" : {
                    "tokenizer" : "standard",
                    "filter" : ["stop", "stop2"]
+                },
+                "custom2" : {
+                    "tokenizer" : "standard",
+                    "char_filter" : ["html_strip", "my_html"]
                }
            }
        }
--- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml
+++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml
@ -3,6 +3,11 @@ index :
    tokenizer :
      standard :
        type : standard
+    char_filter :
+      my_html :
+        type : html_strip
+        escaped_tags : [xxx, yyy]
+        read_ahead : 1024
    filter :
      stop :
        type : stop
@ -17,3 +22,6 @@ index :
      custom1 :
        tokenizer : standard
        filter : [stop, stop2]
+      custom2 :
+        tokenizer : standard
+        char_filter : [html_strip, my_html]