Analysis: Add `char_filter` on top of `tokenizer`, `filter`, and `analyzer`. Add an `html_strip` char filter, closes #315.

This commit is contained in:
kimchy 2010-08-12 18:16:30 +03:00
parent e29925684a
commit 98bc8285ea
13 changed files with 1695 additions and 10 deletions

View File

@ -19,6 +19,7 @@
<w>calc</w> <w>calc</w>
<w>camelcase</w> <w>camelcase</w>
<w>canonicalhost</w> <w>canonicalhost</w>
<w>charfilter</w>
<w>checksum</w> <w>checksum</w>
<w>chunking</w> <w>chunking</w>
<w>closeable</w> <w>closeable</w>

View File

@ -0,0 +1,43 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.AbstractIndexComponent;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
/**
 * Convenience base class for {@link CharFilterFactory} implementations: it keeps the
 * name supplied at construction time and returns it from {@link #name()}.
 *
 * @author kimchy (shay.banon)
 */
public abstract class AbstractCharFilterFactory extends AbstractIndexComponent implements CharFilterFactory {

    /** Name this factory was created with; never reassigned. */
    private final String name;

    /**
     * @param index         forwarded unchanged to the {@link AbstractIndexComponent} constructor
     * @param indexSettings forwarded unchanged to the {@link AbstractIndexComponent} constructor
     * @param name          the value later returned by {@link #name()}
     */
    public AbstractCharFilterFactory(Index index, @IndexSettings Settings indexSettings, String name) {
        super(index, indexSettings);
        this.name = name;
    }

    /**
     * @return the name passed to the constructor
     */
    @Override public String name() {
        return name;
    }
}

View File

@ -37,6 +37,34 @@ public class AnalysisModule extends AbstractModule {
public static class AnalysisBinderProcessor { public static class AnalysisBinderProcessor {
/**
 * Hook that receives the char filter bindings. This base implementation registers
 * nothing; subclasses override it to add their own char filters.
 *
 * @param charFiltersBindings bindings a subclass may register char filters on
 */
public void processCharFilters(CharFiltersBindings charFiltersBindings) {
    // intentionally empty
}
/**
 * Pairs the char filter {@link MapBinder} with the per-name settings groups, and offers
 * {@link #processCharFilter(String, Class)} to bind a char filter only when no settings
 * group exists under the same name.
 */
public static class CharFiltersBindings {

    private final MapBinder<String, CharFilterFactoryFactory> binder;

    private final Map<String, Settings> groupSettings;

    /**
     * @param binder        map binder that char filter factory-factories are bound on
     * @param groupSettings settings groups keyed by char filter name
     */
    public CharFiltersBindings(MapBinder<String, CharFilterFactoryFactory> binder, Map<String, Settings> groupSettings) {
        this.binder = binder;
        this.groupSettings = groupSettings;
    }

    /**
     * @return the map binder handed to the constructor
     */
    public MapBinder<String, CharFilterFactoryFactory> binder() {
        return this.binder;
    }

    /**
     * @return the settings groups handed to the constructor
     */
    public Map<String, Settings> groupSettings() {
        return this.groupSettings;
    }

    /**
     * Binds {@code charFilterFactory} under {@code name} as a singleton, unless a settings
     * group with that name is already present, in which case nothing is bound here.
     *
     * @param name              name to bind the char filter under
     * @param charFilterFactory factory class to create through {@link CharFilterFactoryFactory}
     */
    public void processCharFilter(String name, Class<? extends CharFilterFactory> charFilterFactory) {
        if (groupSettings.containsKey(name)) {
            // a settings group with this name exists; leave the name alone
            return;
        }
        binder.addBinding(name)
                .toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, charFilterFactory))
                .in(Scopes.SINGLETON);
    }
}
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
} }
@ -159,6 +187,27 @@ public class AnalysisModule extends AbstractModule {
} }
@Override protected void configure() { @Override protected void configure() {
MapBinder<String, CharFilterFactoryFactory> charFilterBinder
= MapBinder.newMapBinder(binder(), String.class, CharFilterFactoryFactory.class);
Map<String, Settings> charFiltersSettings = settings.getGroups("index.analysis.char_filter");
for (Map.Entry<String, Settings> entry : charFiltersSettings.entrySet()) {
String charFilterName = entry.getKey();
Settings charFilterSettings = entry.getValue();
Class<? extends CharFilterFactory> type = charFilterSettings.getAsClass("type", null, "org.elasticsearch.index.analysis.", "CharFilterFactory");
if (type == null) {
throw new IllegalArgumentException("Char Filter [" + charFilterName + "] must have a type associated with it");
}
charFilterBinder.addBinding(charFilterName).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, type)).in(Scopes.SINGLETON);
}
AnalysisBinderProcessor.CharFiltersBindings charFiltersBindings = new AnalysisBinderProcessor.CharFiltersBindings(charFilterBinder, charFiltersSettings);
for (AnalysisBinderProcessor processor : processors) {
processor.processCharFilters(charFiltersBindings);
}
MapBinder<String, TokenFilterFactoryFactory> tokenFilterBinder MapBinder<String, TokenFilterFactoryFactory> tokenFilterBinder
= MapBinder.newMapBinder(binder(), String.class, TokenFilterFactoryFactory.class); = MapBinder.newMapBinder(binder(), String.class, TokenFilterFactoryFactory.class);
@ -230,6 +279,11 @@ public class AnalysisModule extends AbstractModule {
private static class DefaultProcessor extends AnalysisBinderProcessor { private static class DefaultProcessor extends AnalysisBinderProcessor {
/**
 * Registers the HTML strip char filter under both its snake_case and camelCase names.
 */
@Override public void processCharFilters(CharFiltersBindings charFiltersBindings) {
    // same factory, two aliases; order matches the original registration order
    for (String alias : new String[]{"html_strip", "htmlStrip"}) {
        charFiltersBindings.processCharFilter(alias, HtmlStripCharFilterFactory.class);
    }
}
@Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("stop", StopTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("stop", StopTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("asciifolding", ASCIIFoldingTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("asciifolding", ASCIIFoldingTokenFilterFactory.class);

View File

@ -42,15 +42,18 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
private final ImmutableMap<String, TokenizerFactory> tokenizers; private final ImmutableMap<String, TokenizerFactory> tokenizers;
private final ImmutableMap<String, CharFilterFactory> charFilters;
private final ImmutableMap<String, TokenFilterFactory> tokenFilters; private final ImmutableMap<String, TokenFilterFactory> tokenFilters;
public AnalysisService(Index index) { public AnalysisService(Index index) {
this(index, ImmutableSettings.Builder.EMPTY_SETTINGS, null, null, null); this(index, ImmutableSettings.Builder.EMPTY_SETTINGS, null, null, null, null);
} }
@Inject public AnalysisService(Index index, @IndexSettings Settings indexSettings, @Inject public AnalysisService(Index index, @IndexSettings Settings indexSettings,
@Nullable Map<String, AnalyzerProviderFactory> analyzerFactoryFactories, @Nullable Map<String, AnalyzerProviderFactory> analyzerFactoryFactories,
@Nullable Map<String, TokenizerFactoryFactory> tokenizerFactoryFactories, @Nullable Map<String, TokenizerFactoryFactory> tokenizerFactoryFactories,
@Nullable Map<String, CharFilterFactoryFactory> charFilterFactoryFactories,
@Nullable Map<String, TokenFilterFactoryFactory> tokenFilterFactoryFactories) { @Nullable Map<String, TokenFilterFactoryFactory> tokenFilterFactoryFactories) {
super(index, indexSettings); super(index, indexSettings);
@ -105,6 +108,24 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
} }
this.tokenizers = ImmutableMap.copyOf(tokenizers); this.tokenizers = ImmutableMap.copyOf(tokenizers);
Map<String, CharFilterFactory> charFilters = newHashMap();
if (charFilterFactoryFactories != null) {
Map<String, Settings> charFiltersSettings = indexSettings.getGroups("index.analysis.char_filter");
for (Map.Entry<String, CharFilterFactoryFactory> entry : charFilterFactoryFactories.entrySet()) {
String charFilterName = entry.getKey();
CharFilterFactoryFactory charFilterFactoryFactory = entry.getValue();
Settings charFilterSettings = charFiltersSettings.get(charFilterName);
if (charFilterSettings == null) {
charFilterSettings = ImmutableSettings.Builder.EMPTY_SETTINGS;
}
CharFilterFactory tokenFilterFactory = charFilterFactoryFactory.create(charFilterName, charFilterSettings);
charFilters.put(charFilterName, tokenFilterFactory);
}
}
this.charFilters = ImmutableMap.copyOf(charFilters);
Map<String, TokenFilterFactory> tokenFilters = newHashMap(); Map<String, TokenFilterFactory> tokenFilters = newHashMap();
if (tokenFilterFactoryFactories != null) { if (tokenFilterFactoryFactories != null) {
Map<String, Settings> tokenFiltersSettings = indexSettings.getGroups("index.analysis.filter"); Map<String, Settings> tokenFiltersSettings = indexSettings.getGroups("index.analysis.filter");
@ -152,6 +173,10 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
return tokenizers.get(name); return tokenizers.get(name);
} }
/**
 * Looks up a char filter factory by name.
 *
 * @param name name the char filter factory was registered under
 * @return the matching factory, or whatever the underlying map returns for an unknown name
 */
public CharFilterFactory charFilter(String name) {
    final CharFilterFactory factory = charFilters.get(name);
    return factory;
}
public TokenFilterFactory tokenFilter(String name) { public TokenFilterFactory tokenFilter(String name) {
return tokenFilters.get(name); return tokenFilters.get(name);
} }

View File

@ -0,0 +1,33 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.CharStream;
import org.elasticsearch.index.IndexComponent;
/**
 * A named, index-level factory that produces a {@link CharStream} from another
 * {@link CharStream}.
 *
 * @author kimchy (shay.banon)
 */
public interface CharFilterFactory extends IndexComponent {

    /**
     * @return the name of this char filter factory
     */
    String name();

    /**
     * Creates a char stream based on the given one.
     *
     * @param charStream the input character stream (a character stream, not a token stream)
     * @return the char stream produced by this factory for {@code charStream}
     */
    CharStream create(CharStream charStream);
}

View File

@ -0,0 +1,30 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.common.settings.Settings;
/**
 * Creates {@link CharFilterFactory} instances from a name and a {@link Settings} object.
 *
 * @author kimchy (shay.banon)
 */
public interface CharFilterFactoryFactory {

    /**
     * @param name     name for the char filter factory being created
     * @param settings settings for the char filter factory being created
     * @return the created char filter factory
     */
    CharFilterFactory create(String name, Settings settings);
}

View File

@ -19,9 +19,7 @@
package org.elasticsearch.index.analysis; package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
@ -33,12 +31,15 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
private final TokenizerFactory tokenizerFactory; private final TokenizerFactory tokenizerFactory;
private final CharFilterFactory[] charFilters;
private final TokenFilterFactory[] tokenFilters; private final TokenFilterFactory[] tokenFilters;
private int positionIncrementGap = 0; private int positionIncrementGap = 0;
public CustomAnalyzer(TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilters) { public CustomAnalyzer(TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters, TokenFilterFactory[] tokenFilters) {
this.tokenizerFactory = tokenizerFactory; this.tokenizerFactory = tokenizerFactory;
this.charFilters = charFilters;
this.tokenFilters = tokenFilters; this.tokenFilters = tokenFilters;
} }
@ -54,6 +55,10 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
return tokenFilters; return tokenFilters;
} }
/**
 * @return the char filter factories array held by this analyzer (the field itself, not a copy)
 */
public CharFilterFactory[] charFilters() {
    return this.charFilters;
}
@Override public int getPositionIncrementGap(String fieldName) { @Override public int getPositionIncrementGap(String fieldName) {
return this.positionIncrementGap; return this.positionIncrementGap;
} }
@ -65,10 +70,10 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
@Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Holder holder = (Holder) getPreviousTokenStream(); Holder holder = (Holder) getPreviousTokenStream();
if (holder == null) { if (holder == null) {
holder = buildHolder(reader); holder = buildHolder(charFilterIfNeeded(reader));
setPreviousTokenStream(holder); setPreviousTokenStream(holder);
} else { } else {
holder.tokenizer.reset(reader); holder.tokenizer.reset(charFilterIfNeeded(reader));
} }
return holder.tokenStream; return holder.tokenStream;
} }
@ -82,7 +87,18 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
return new Holder(tokenizer, tokenStream); return new Holder(tokenizer, tokenStream);
} }
private static class Holder { private Reader charFilterIfNeeded(Reader reader) {
if (charFilters != null && charFilters.length > 0) {
CharStream charStream = CharReader.get(reader);
for (CharFilterFactory charFilter : charFilters) {
charStream = charFilter.create(charStream);
}
reader = charStream;
}
return reader;
}
static class Holder {
final Tokenizer tokenizer; final Tokenizer tokenizer;
final TokenStream tokenStream; final TokenStream tokenStream;

View File

@ -35,18 +35,21 @@ import static org.elasticsearch.common.collect.Lists.*;
* A custom analyzer that is built out of a single {@link org.apache.lucene.analysis.Tokenizer} and a list * A custom analyzer that is built out of a single {@link org.apache.lucene.analysis.Tokenizer} and a list
* of {@link org.apache.lucene.analysis.TokenFilter}s. * of {@link org.apache.lucene.analysis.TokenFilter}s.
* *
* @author kimchy (Shay Banon) * @author kimchy (shay.banon)
*/ */
public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> { public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {
private final TokenizerFactory tokenizerFactory; private final TokenizerFactory tokenizerFactory;
private final CharFilterFactory[] charFilterFactories;
private final TokenFilterFactory[] tokenFilterFactories; private final TokenFilterFactory[] tokenFilterFactories;
private final CustomAnalyzer customAnalyzer; private final CustomAnalyzer customAnalyzer;
@Inject public CustomAnalyzerProvider(Index index, @Inject public CustomAnalyzerProvider(Index index,
Map<String, TokenizerFactoryFactory> tokenizerFactories, Map<String, TokenizerFactoryFactory> tokenizerFactories,
Map<String, CharFilterFactoryFactory> charFilterFactories,
Map<String, TokenFilterFactoryFactory> tokenFilterFactories, Map<String, TokenFilterFactoryFactory> tokenFilterFactories,
@IndexSettings Settings indexSettings, @IndexSettings Settings indexSettings,
@Assisted String name, @Assisted Settings settings) { @Assisted String name, @Assisted Settings settings) {
@ -65,6 +68,21 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
} }
tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, tokenizerSettings); tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, tokenizerSettings);
List<CharFilterFactory> charFilters = newArrayList();
String[] charFilterNames = settings.getAsArray("char_filter");
for (String charFilterName : charFilterNames) {
CharFilterFactoryFactory charFilterFactoryFactory = charFilterFactories.get(charFilterName);
if (charFilterFactoryFactory == null) {
throw new IllegalArgumentException("Custom Analyzer [" + name + "] failed to find char filter under name [" + charFilterName + "]");
}
Settings charFilterSettings = indexSettings.getGroups("index.analysis.char_filter").get(charFilterName);
if (charFilterSettings == null) {
charFilterSettings = ImmutableSettings.Builder.EMPTY_SETTINGS;
}
charFilters.add(charFilterFactoryFactory.create(charFilterName, charFilterSettings));
}
this.charFilterFactories = charFilters.toArray(new CharFilterFactory[charFilters.size()]);
List<TokenFilterFactory> tokenFilters = newArrayList(); List<TokenFilterFactory> tokenFilters = newArrayList();
String[] tokenFilterNames = settings.getAsArray("filter"); String[] tokenFilterNames = settings.getAsArray("filter");
for (String tokenFilterName : tokenFilterNames) { for (String tokenFilterName : tokenFilterNames) {
@ -80,7 +98,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
} }
this.tokenFilterFactories = tokenFilters.toArray(new TokenFilterFactory[tokenFilters.size()]); this.tokenFilterFactories = tokenFilters.toArray(new TokenFilterFactory[tokenFilters.size()]);
this.customAnalyzer = new CustomAnalyzer(this.tokenizerFactory, this.tokenFilterFactories); this.customAnalyzer = new CustomAnalyzer(this.tokenizerFactory, this.charFilterFactories, this.tokenFilterFactories);
} }
@Override public CustomAnalyzer get() { @Override public CustomAnalyzer get() {

View File

@ -0,0 +1,62 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.CharStream;
import org.elasticsearch.common.collect.ImmutableSet;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
/**
 * {@link CharFilterFactory} that wraps a char stream in an {@link HTMLStripCharFilter}.
 *
 * <p>Reads two settings: {@code read_ahead} (defaults to
 * {@link HTMLStripCharFilter#DEFAULT_READ_AHEAD}) and {@code escaped_tags}.
 *
 * @author kimchy (shay.banon)
 */
public class HtmlStripCharFilterFactory extends AbstractCharFilterFactory {

    /** Configured {@code escaped_tags}, or {@code null} when the setting yields no values. */
    private final ImmutableSet<String> escapedTags;

    /** Configured {@code read_ahead} value. */
    private final int readAheadLimit;

    /**
     * @param index         forwarded to the superclass
     * @param indexSettings forwarded to the superclass
     * @param name          name of this char filter factory
     * @param settings      settings of this char filter; {@code read_ahead} and
     *                      {@code escaped_tags} are read from it
     */
    @Inject public HtmlStripCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        this.readAheadLimit = settings.getAsInt("read_ahead", HTMLStripCharFilter.DEFAULT_READ_AHEAD);
        // Distinct local name so the field is not shadowed inside the constructor.
        String[] configuredTags = settings.getAsArray("escaped_tags");
        // Keep null (rather than an empty set) when nothing is configured, as before.
        this.escapedTags = configuredTags.length > 0 ? ImmutableSet.copyOf(configuredTags) : null;
    }

    /**
     * @return the configured escaped tags, or {@code null} if none were configured
     */
    public ImmutableSet<String> escapedTags() {
        return escapedTags;
    }

    /**
     * @return the configured read-ahead limit
     */
    public int readAheadLimit() {
        return readAheadLimit;
    }

    /**
     * Wraps the given char stream in a new {@link HTMLStripCharFilter} built with this
     * factory's escaped tags and read-ahead limit.
     *
     * @param charStream the character stream to wrap
     * @return the wrapping {@link HTMLStripCharFilter}
     */
    @Override public CharStream create(CharStream charStream) {
        return new HTMLStripCharFilter(charStream, escapedTags, readAheadLimit);
    }
}

View File

@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.common.inject.Guice; import org.elasticsearch.common.inject.Guice;
import org.elasticsearch.common.inject.Injector; import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index; import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule; import org.elasticsearch.index.IndexNameModule;
@ -66,5 +67,15 @@ public class AnalysisModuleTests {
StopTokenFilterFactory stop1 = (StopTokenFilterFactory) custom1.tokenFilters()[0]; StopTokenFilterFactory stop1 = (StopTokenFilterFactory) custom1.tokenFilters()[0];
assertThat(stop1.stopWords().size(), equalTo(1)); assertThat(stop1.stopWords().size(), equalTo(1));
assertThat(stop1.stopWords(), hasItem("test-stop")); assertThat(stop1.stopWords(), hasItem("test-stop"));
analyzer = analysisService.analyzer("custom2").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer custom2 = (CustomAnalyzer) analyzer;
HtmlStripCharFilterFactory html = (HtmlStripCharFilterFactory) custom2.charFilters()[0];
assertThat(html.readAheadLimit(), equalTo(HTMLStripCharFilter.DEFAULT_READ_AHEAD));
html = (HtmlStripCharFilterFactory) custom2.charFilters()[1];
assertThat(html.readAheadLimit(), equalTo(1024));
} }
} }

View File

@ -6,6 +6,13 @@
"type" : "standard" "type" : "standard"
} }
}, },
"char_filter" : {
"my_html" : {
"type" : "html_strip",
"escaped_tags" : ["xxx", "yyy"],
"read_ahead" : 1024
}
},
"filter" : { "filter" : {
"stop" : { "stop" : {
"type" : "stop", "type" : "stop",
@ -24,6 +31,10 @@
"custom1" : { "custom1" : {
"tokenizer" : "standard", "tokenizer" : "standard",
"filter" : ["stop", "stop2"] "filter" : ["stop", "stop2"]
},
"custom2" : {
"tokenizer" : "standard",
"char_filter" : ["html_strip", "my_html"]
} }
} }
} }

View File

@ -3,6 +3,11 @@ index :
tokenizer : tokenizer :
standard : standard :
type : standard type : standard
char_filter :
my_html :
type : html_strip
escaped_tags : [xxx, yyy]
read_ahead : 1024
filter : filter :
stop : stop :
type : stop type : stop
@ -17,3 +22,6 @@ index :
custom1 : custom1 :
tokenizer : standard tokenizer : standard
filter : [stop, stop2] filter : [stop, stop2]
custom2 :
tokenizer : standard
char_filter : [html_strip, my_html]