From 7e55354f4ac2c8eae7d6158ed5371aa4b7b96f24 Mon Sep 17 00:00:00 2001 From: Alexander Reelsen Date: Mon, 24 Jun 2013 18:41:39 +0200 Subject: [PATCH] Added support for PatternReplaceCharFilter PatternReplaceCharFilter allows the use of a regex to manipulate the characters in a string before analysis Closes #3197 --- .../index/analysis/AnalysisModule.java | 1 + .../PatternReplaceCharFilterFactory.java | 66 +++++++++++++++++++ .../index/analysis/AnalysisModuleTests.java | 8 +++ .../test/unit/index/analysis/test1.json | 9 +++ .../test/unit/index/analysis/test1.yml | 7 ++ 5 files changed, 91 insertions(+) create mode 100644 src/main/java/org/elasticsearch/index/analysis/PatternReplaceCharFilterFactory.java diff --git a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index 355eb8f62c1..dee7200baf2 100644 --- a/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -417,6 +417,7 @@ public class AnalysisModule extends AbstractModule { @Override public void processCharFilters(CharFiltersBindings charFiltersBindings) { charFiltersBindings.processCharFilter("html_strip", HtmlStripCharFilterFactory.class); + charFiltersBindings.processCharFilter("pattern_replace", PatternReplaceCharFilterFactory.class); } @Override diff --git a/src/main/java/org/elasticsearch/index/analysis/PatternReplaceCharFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/PatternReplaceCharFilterFactory.java new file mode 100644 index 00000000000..507deeacaea --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/PatternReplaceCharFilterFactory.java @@ -0,0 +1,66 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter; +import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.Reader; +import java.util.regex.Pattern; + +@AnalysisSettingsRequired +public class PatternReplaceCharFilterFactory extends AbstractCharFilterFactory { + + private final Pattern pattern; + private final String replacement; + + @Inject + public PatternReplaceCharFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name); + + if (!Strings.hasLength(settings.get("pattern"))) { + throw new ElasticSearchIllegalArgumentException("pattern is missing for [" + name + "] char filter of type 'pattern_replace'"); + } + pattern = Pattern.compile(settings.get("pattern")); + + replacement = settings.get("replacement"); + if (!Strings.hasLength(replacement)) { + throw new ElasticSearchIllegalArgumentException("replacement is missing for [" + name + "] char filter of type 'pattern_replace'"); + } + } + + public Pattern getPattern() { + return pattern; + } + + public String getReplacement() { + return replacement; + } + + @Override + public Reader create(Reader tokenStream) { + return new PatternReplaceCharFilter(pattern, replacement, tokenStream); + } +} diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java b/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java index f0490d9fd83..02ce4fda878 100644 --- a/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java +++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java @@ -136,6 +136,14 @@ public class AnalysisModuleTests { analyzer = analysisService.analyzer("alias1").analyzer(); assertThat(analyzer, instanceOf(StandardAnalyzer.class)); + // check custom pattern replace filter + analyzer = analysisService.analyzer("custom3").analyzer(); + assertThat(analyzer, instanceOf(CustomAnalyzer.class)); + CustomAnalyzer custom3 = (CustomAnalyzer) analyzer; + PatternReplaceCharFilterFactory patternReplaceCharFilterFactory = (PatternReplaceCharFilterFactory) custom3.charFilters()[0]; + assertThat(patternReplaceCharFilterFactory.getPattern().pattern(), equalTo("sample(.*)")); + assertThat(patternReplaceCharFilterFactory.getReplacement(), equalTo("replacedSample $1")); + // check custom class name (my) analyzer = analysisService.analyzer("custom4").analyzer(); assertThat(analyzer, instanceOf(CustomAnalyzer.class)); diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.json b/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.json index 9aef36ce09d..37b1b2ac3f7 100644 --- a/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.json +++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.json @@ -12,6 +12,11 @@ "escaped_tags":["xxx", "yyy"], "read_ahead":1024 }, + "my_pattern":{ + "type":"pattern_replace", + "pattern":"sample(.*)", + "replacement":"replacedSample $1" + }, "my_mapping":{ "type":"mapping", "mappings":["ph=>f", "qu=>q"] @@ -49,6 +54,10 @@ "tokenizer":"standard", "char_filter":["html_strip", "my_html"] }, + "custom3":{ + "tokenizer":"standard", + "char_filter":["my_pattern"] + }, "custom4":{ "tokenizer":"standard", "filter":["my"] diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.yml b/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.yml index dc31fb8639a..9bfc1ba5079 100644 --- a/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.yml +++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/test1.yml @@ -8,6 +8,10 @@ index : type : html_strip escaped_tags : [xxx, yyy] read_ahead : 1024 + my_pattern : + type: pattern_replace + pattern: sample(.*) + replacement: replacedSample $1 my_mapping : type : mapping mappings : [ph=>f, qu=>q] @@ -35,6 +39,9 @@ index : custom2 : tokenizer : standard char_filter : [html_strip, my_html] + custom3 : + tokenizer : standard + char_filter : [my_pattern] custom4 : tokenizer : standard filter : [my]