Analysis: Add `char_filter` on top of `tokenizer`, `filter`, and `analyzer`. Add an `html_strip` char filter, closes #315.
parent e29925684a
commit 98bc8285ea
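For orientation, the new `char_filter` section sits in the index analysis settings next to `tokenizer`, `filter`, and `analyzer`, and custom analyzers reference char filters by name. A minimal sketch of the resulting settings, based on the test configuration added in this commit (the `my_html` and `custom2` names come from those tests and are illustrative):

    {
        "index" : {
            "analysis" : {
                "char_filter" : {
                    "my_html" : {
                        "type" : "html_strip",
                        "escaped_tags" : ["xxx", "yyy"],
                        "read_ahead" : 1024
                    }
                },
                "analyzer" : {
                    "custom2" : {
                        "tokenizer" : "standard",
                        "char_filter" : ["html_strip", "my_html"]
                    }
                }
            }
        }
    }

Here `html_strip` resolves to the built-in char filter registered by this commit, while `my_html` is a named instance configured under `index.analysis.char_filter`; at analysis time the configured char filters wrap the input Reader before it reaches the tokenizer.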
(project dictionary word list)
@@ -19,6 +19,7 @@
 <w>calc</w>
 <w>camelcase</w>
 <w>canonicalhost</w>
+<w>charfilter</w>
 <w>checksum</w>
 <w>chunking</w>
 <w>closeable</w>
(One file's diff was suppressed by the viewer because it is too large.)
AbstractCharFilterFactory.java (new file)
@@ -0,0 +1,43 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.AbstractIndexComponent;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * @author kimchy (shay.banon)
+ */
+public abstract class AbstractCharFilterFactory extends AbstractIndexComponent implements CharFilterFactory {
+
+    private final String name;
+
+    public AbstractCharFilterFactory(Index index, @IndexSettings Settings indexSettings, String name) {
+        super(index, indexSettings);
+        this.name = name;
+    }
+
+    @Override public String name() {
+        return this.name;
+    }
+}
AnalysisModule.java
@@ -37,6 +37,34 @@ public class AnalysisModule extends AbstractModule {
 
     public static class AnalysisBinderProcessor {
 
+        public void processCharFilters(CharFiltersBindings charFiltersBindings) {
+
+        }
+
+        public static class CharFiltersBindings {
+            private final MapBinder<String, CharFilterFactoryFactory> binder;
+            private final Map<String, Settings> groupSettings;
+
+            public CharFiltersBindings(MapBinder<String, CharFilterFactoryFactory> binder, Map<String, Settings> groupSettings) {
+                this.binder = binder;
+                this.groupSettings = groupSettings;
+            }
+
+            public MapBinder<String, CharFilterFactoryFactory> binder() {
+                return binder;
+            }
+
+            public Map<String, Settings> groupSettings() {
+                return groupSettings;
+            }
+
+            public void processCharFilter(String name, Class<? extends CharFilterFactory> charFilterFactory) {
+                if (!groupSettings.containsKey(name)) {
+                    binder.addBinding(name).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, charFilterFactory)).in(Scopes.SINGLETON);
+                }
+            }
+        }
+
         public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
 
         }
@@ -159,6 +187,27 @@ public class AnalysisModule extends AbstractModule {
     }
 
     @Override protected void configure() {
+        MapBinder<String, CharFilterFactoryFactory> charFilterBinder
+                = MapBinder.newMapBinder(binder(), String.class, CharFilterFactoryFactory.class);
+
+        Map<String, Settings> charFiltersSettings = settings.getGroups("index.analysis.char_filter");
+        for (Map.Entry<String, Settings> entry : charFiltersSettings.entrySet()) {
+            String charFilterName = entry.getKey();
+            Settings charFilterSettings = entry.getValue();
+
+            Class<? extends CharFilterFactory> type = charFilterSettings.getAsClass("type", null, "org.elasticsearch.index.analysis.", "CharFilterFactory");
+            if (type == null) {
+                throw new IllegalArgumentException("Char Filter [" + charFilterName + "] must have a type associated with it");
+            }
+            charFilterBinder.addBinding(charFilterName).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, type)).in(Scopes.SINGLETON);
+        }
+
+        AnalysisBinderProcessor.CharFiltersBindings charFiltersBindings = new AnalysisBinderProcessor.CharFiltersBindings(charFilterBinder, charFiltersSettings);
+        for (AnalysisBinderProcessor processor : processors) {
+            processor.processCharFilters(charFiltersBindings);
+        }
+
+
         MapBinder<String, TokenFilterFactoryFactory> tokenFilterBinder
                 = MapBinder.newMapBinder(binder(), String.class, TokenFilterFactoryFactory.class);
@@ -230,6 +279,11 @@ public class AnalysisModule extends AbstractModule {
 
     private static class DefaultProcessor extends AnalysisBinderProcessor {
 
+        @Override public void processCharFilters(CharFiltersBindings charFiltersBindings) {
+            charFiltersBindings.processCharFilter("html_strip", HtmlStripCharFilterFactory.class);
+            charFiltersBindings.processCharFilter("htmlStrip", HtmlStripCharFilterFactory.class);
+        }
+
         @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
             tokenFiltersBindings.processTokenFilter("stop", StopTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("asciifolding", ASCIIFoldingTokenFilterFactory.class);
AnalysisService.java
@@ -42,15 +42,18 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
 
     private final ImmutableMap<String, TokenizerFactory> tokenizers;
 
+    private final ImmutableMap<String, CharFilterFactory> charFilters;
+
     private final ImmutableMap<String, TokenFilterFactory> tokenFilters;
 
     public AnalysisService(Index index) {
-        this(index, ImmutableSettings.Builder.EMPTY_SETTINGS, null, null, null);
+        this(index, ImmutableSettings.Builder.EMPTY_SETTINGS, null, null, null, null);
     }
 
     @Inject public AnalysisService(Index index, @IndexSettings Settings indexSettings,
                                    @Nullable Map<String, AnalyzerProviderFactory> analyzerFactoryFactories,
                                    @Nullable Map<String, TokenizerFactoryFactory> tokenizerFactoryFactories,
+                                   @Nullable Map<String, CharFilterFactoryFactory> charFilterFactoryFactories,
                                    @Nullable Map<String, TokenFilterFactoryFactory> tokenFilterFactoryFactories) {
         super(index, indexSettings);
@@ -105,6 +108,24 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
         }
         this.tokenizers = ImmutableMap.copyOf(tokenizers);
 
+        Map<String, CharFilterFactory> charFilters = newHashMap();
+        if (charFilterFactoryFactories != null) {
+            Map<String, Settings> charFiltersSettings = indexSettings.getGroups("index.analysis.char_filter");
+            for (Map.Entry<String, CharFilterFactoryFactory> entry : charFilterFactoryFactories.entrySet()) {
+                String charFilterName = entry.getKey();
+                CharFilterFactoryFactory charFilterFactoryFactory = entry.getValue();
+
+                Settings charFilterSettings = charFiltersSettings.get(charFilterName);
+                if (charFilterSettings == null) {
+                    charFilterSettings = ImmutableSettings.Builder.EMPTY_SETTINGS;
+                }
+
+                CharFilterFactory tokenFilterFactory = charFilterFactoryFactory.create(charFilterName, charFilterSettings);
+                charFilters.put(charFilterName, tokenFilterFactory);
+            }
+        }
+        this.charFilters = ImmutableMap.copyOf(charFilters);
+
         Map<String, TokenFilterFactory> tokenFilters = newHashMap();
         if (tokenFilterFactoryFactories != null) {
             Map<String, Settings> tokenFiltersSettings = indexSettings.getGroups("index.analysis.filter");
@@ -152,6 +173,10 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
         return tokenizers.get(name);
     }
 
+    public CharFilterFactory charFilter(String name) {
+        return charFilters.get(name);
+    }
+
     public TokenFilterFactory tokenFilter(String name) {
         return tokenFilters.get(name);
     }
CharFilterFactory.java (new file)
@@ -0,0 +1,33 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.CharStream;
+import org.elasticsearch.index.IndexComponent;
+
+/**
+ * @author kimchy (shay.banon)
+ */
+public interface CharFilterFactory extends IndexComponent {
+
+    String name();
+
+    CharStream create(CharStream tokenStream);
+}
CharFilterFactoryFactory.java (new file)
@@ -0,0 +1,30 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * @author kimchy (shay.banon)
+ */
+public interface CharFilterFactoryFactory {
+
+    CharFilterFactory create(String name, Settings settings);
+}
CustomAnalyzer.java
@@ -19,9 +19,7 @@
 
 package org.elasticsearch.index.analysis;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.*;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -33,12 +31,15 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
 
     private final TokenizerFactory tokenizerFactory;
 
+    private final CharFilterFactory[] charFilters;
+
     private final TokenFilterFactory[] tokenFilters;
 
     private int positionIncrementGap = 0;
 
-    public CustomAnalyzer(TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilters) {
+    public CustomAnalyzer(TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters, TokenFilterFactory[] tokenFilters) {
         this.tokenizerFactory = tokenizerFactory;
+        this.charFilters = charFilters;
         this.tokenFilters = tokenFilters;
     }
@@ -54,6 +55,10 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
         return tokenFilters;
     }
 
+    public CharFilterFactory[] charFilters() {
+        return charFilters;
+    }
+
     @Override public int getPositionIncrementGap(String fieldName) {
         return this.positionIncrementGap;
     }
@@ -65,10 +70,10 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
     @Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
         Holder holder = (Holder) getPreviousTokenStream();
         if (holder == null) {
-            holder = buildHolder(reader);
+            holder = buildHolder(charFilterIfNeeded(reader));
             setPreviousTokenStream(holder);
         } else {
-            holder.tokenizer.reset(reader);
+            holder.tokenizer.reset(charFilterIfNeeded(reader));
         }
         return holder.tokenStream;
     }
@@ -82,7 +87,18 @@ public class CustomAnalyzer extends Analyzer implements PositionIncrementGapAnal
         return new Holder(tokenizer, tokenStream);
     }
 
-    private static class Holder {
+    private Reader charFilterIfNeeded(Reader reader) {
+        if (charFilters != null && charFilters.length > 0) {
+            CharStream charStream = CharReader.get(reader);
+            for (CharFilterFactory charFilter : charFilters) {
+                charStream = charFilter.create(charStream);
+            }
+            reader = charStream;
+        }
+        return reader;
+    }
+
+    static class Holder {
         final Tokenizer tokenizer;
         final TokenStream tokenStream;
CustomAnalyzerProvider.java
@@ -35,18 +35,21 @@ import static org.elasticsearch.common.collect.Lists.*;
  * A custom analyzer that is built out of a single {@link org.apache.lucene.analysis.Tokenizer} and a list
  * of {@link org.apache.lucene.analysis.TokenFilter}s.
  *
- * @author kimchy (Shay Banon)
+ * @author kimchy (shay.banon)
  */
 public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {
 
     private final TokenizerFactory tokenizerFactory;
 
+    private final CharFilterFactory[] charFilterFactories;
+
     private final TokenFilterFactory[] tokenFilterFactories;
 
     private final CustomAnalyzer customAnalyzer;
 
     @Inject public CustomAnalyzerProvider(Index index,
                                           Map<String, TokenizerFactoryFactory> tokenizerFactories,
+                                          Map<String, CharFilterFactoryFactory> charFilterFactories,
                                           Map<String, TokenFilterFactoryFactory> tokenFilterFactories,
                                           @IndexSettings Settings indexSettings,
                                           @Assisted String name, @Assisted Settings settings) {
@@ -65,6 +68,21 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
         }
         tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, tokenizerSettings);
 
+        List<CharFilterFactory> charFilters = newArrayList();
+        String[] charFilterNames = settings.getAsArray("char_filter");
+        for (String charFilterName : charFilterNames) {
+            CharFilterFactoryFactory charFilterFactoryFactory = charFilterFactories.get(charFilterName);
+            if (charFilterFactoryFactory == null) {
+                throw new IllegalArgumentException("Custom Analyzer [" + name + "] failed to find char filter under name [" + charFilterName + "]");
+            }
+            Settings charFilterSettings = indexSettings.getGroups("index.analysis.char_filter").get(charFilterName);
+            if (charFilterSettings == null) {
+                charFilterSettings = ImmutableSettings.Builder.EMPTY_SETTINGS;
+            }
+            charFilters.add(charFilterFactoryFactory.create(charFilterName, charFilterSettings));
+        }
+        this.charFilterFactories = charFilters.toArray(new CharFilterFactory[charFilters.size()]);
+
         List<TokenFilterFactory> tokenFilters = newArrayList();
         String[] tokenFilterNames = settings.getAsArray("filter");
         for (String tokenFilterName : tokenFilterNames) {
@@ -80,7 +98,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
         }
         this.tokenFilterFactories = tokenFilters.toArray(new TokenFilterFactory[tokenFilters.size()]);
 
-        this.customAnalyzer = new CustomAnalyzer(this.tokenizerFactory, this.tokenFilterFactories);
+        this.customAnalyzer = new CustomAnalyzer(this.tokenizerFactory, this.charFilterFactories, this.tokenFilterFactories);
     }
 
     @Override public CustomAnalyzer get() {
HtmlStripCharFilterFactory.java (new file)
@@ -0,0 +1,62 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.CharStream;
+import org.elasticsearch.common.collect.ImmutableSet;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * @author kimchy (shay.banon)
+ */
+public class HtmlStripCharFilterFactory extends AbstractCharFilterFactory {
+
+    private final ImmutableSet<String> escapedTags;
+
+    private final int readAheadLimit;
+
+    @Inject public HtmlStripCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name);
+        this.readAheadLimit = settings.getAsInt("read_ahead", HTMLStripCharFilter.DEFAULT_READ_AHEAD);
+        String[] escapedTags = settings.getAsArray("escaped_tags");
+        if (escapedTags.length > 0) {
+            this.escapedTags = ImmutableSet.copyOf(escapedTags);
+        } else {
+            this.escapedTags = null;
+        }
+    }
+
+    public ImmutableSet<String> escapedTags() {
+        return escapedTags;
+    }
+
+    public int readAheadLimit() {
+        return readAheadLimit;
+    }
+
+    @Override public CharStream create(CharStream tokenStream) {
+        return new HTMLStripCharFilter(tokenStream, escapedTags, readAheadLimit);
+    }
+}
AnalysisModuleTests.java
@@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.Analyzer;
 import org.elasticsearch.common.inject.Guice;
 import org.elasticsearch.common.inject.Injector;
+import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNameModule;
@@ -66,5 +67,15 @@ public class AnalysisModuleTests {
         StopTokenFilterFactory stop1 = (StopTokenFilterFactory) custom1.tokenFilters()[0];
         assertThat(stop1.stopWords().size(), equalTo(1));
         assertThat(stop1.stopWords(), hasItem("test-stop"));
+
+        analyzer = analysisService.analyzer("custom2").analyzer();
+        assertThat(analyzer, instanceOf(CustomAnalyzer.class));
+        CustomAnalyzer custom2 = (CustomAnalyzer) analyzer;
+
+        HtmlStripCharFilterFactory html = (HtmlStripCharFilterFactory) custom2.charFilters()[0];
+        assertThat(html.readAheadLimit(), equalTo(HTMLStripCharFilter.DEFAULT_READ_AHEAD));
+
+        html = (HtmlStripCharFilterFactory) custom2.charFilters()[1];
+        assertThat(html.readAheadLimit(), equalTo(1024));
     }
 }
Test settings (JSON)
@@ -6,6 +6,13 @@
             "type" : "standard"
         }
     },
+    "char_filter" : {
+        "my_html" : {
+            "type" : "html_strip",
+            "escaped_tags" : ["xxx", "yyy"],
+            "read_ahead" : 1024
+        }
+    },
     "filter" : {
         "stop" : {
             "type" : "stop",
@@ -24,6 +31,10 @@
         "custom1" : {
             "tokenizer" : "standard",
             "filter" : ["stop", "stop2"]
+        },
+        "custom2" : {
+            "tokenizer" : "standard",
+            "char_filter" : ["html_strip", "my_html"]
         }
     }
 }
Test settings (YAML)
@@ -3,6 +3,11 @@ index :
     tokenizer :
         standard :
             type : standard
+    char_filter :
+        my_html :
+            type : html_strip
+            escaped_tags : [xxx, yyy]
+            read_ahead : 1024
     filter :
         stop :
             type : stop
@@ -17,3 +22,6 @@ index :
     custom1 :
         tokenizer : standard
         filter : [stop, stop2]
+    custom2 :
+        tokenizer : standard
+        char_filter : [html_strip, my_html]