From ac7c8cb650935f5b67343b28a88af8ea8db07467 Mon Sep 17 00:00:00 2001
From: kimchy
Date: Sun, 25 Jul 2010 22:40:50 +0300
Subject: [PATCH] Analysis: Add pattern analyzer, closes #276.

---
 .idea/dictionaries/kimchy.xml | 1 +
 .../org/elasticsearch/common/regex/Regex.java | 71 +++++++++++++++++++
 .../index/analysis/AnalysisModule.java | 2 +
 .../analysis/PatternAnalyzerProvider.java | 71 +++++++++++++++++++
 .../index/analysis/AnalysisModuleTests.java | 2 +-
 .../elasticsearch/index/analysis/test1.json | 38 +++++-----
 6 files changed, 165 insertions(+), 20 deletions(-)
 create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/common/regex/Regex.java
 create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java

diff --git a/.idea/dictionaries/kimchy.xml b/.idea/dictionaries/kimchy.xml
index 2574bef590f..ffad318b8dd 100644
--- a/.idea/dictionaries/kimchy.xml
+++ b/.idea/dictionaries/kimchy.xml
@@ -95,6 +95,7 @@
 queryparser
 rackspace
 rebalance
+regex
 reparse
 retrans
 retval
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/common/regex/Regex.java b/modules/elasticsearch/src/main/java/org/elasticsearch/common/regex/Regex.java
new file mode 100644
index 00000000000..293bdf3313d
--- /dev/null
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/common/regex/Regex.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.common.regex;
+
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.Strings;
+
+import java.util.regex.Pattern;
+
+/**
+ * @author kimchy (shay.banon)
+ */
+public class Regex {
+
+    /**
+     * Compiles a regular expression, translating a textual list of flag names
+     * into the matching {@link Pattern} flag bits.
+     *
+     * @param regex the regular expression to compile
+     * @param flags {@code |} delimited names of {@link Pattern} flag constants,
+     *              matched ignoring case (for example {@code CASE_INSENSITIVE|DOTALL});
+     *              {@code null} compiles the expression without any flags
+     * @return the compiled pattern
+     * @throws ElasticSearchIllegalArgumentException if a flag name is not one of
+     *         the {@link Pattern} flag names this method recognizes
+     */
+    public static Pattern compile(String regex, String flags) {
+        int pFlags = 0;
+        if (flags != null) {
+            for (String s : Strings.delimitedListToStringArray(flags, "|")) {
+                if ("CASE_INSENSITIVE".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.CASE_INSENSITIVE;
+                } else if ("MULTILINE".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.MULTILINE;
+                } else if ("DOTALL".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.DOTALL;
+                } else if ("UNICODE_CASE".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.UNICODE_CASE;
+                } else if ("CANON_EQ".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.CANON_EQ;
+                } else if ("UNIX_LINES".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.UNIX_LINES;
+                } else if ("LITERAL".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.LITERAL;
+                } else if ("COMMENTS".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.COMMENTS;
+                } else {
+                    throw new ElasticSearchIllegalArgumentException("Unknown regex flag [" + s + "] to compile [" + regex + "]");
+                }
+            }
+        }
+        return Pattern.compile(regex, pFlags);
+    }
+}
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
index dd94c1714a6..032c1f3b2e5 100644
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -286,6 +286,8 @@ public class AnalysisModule extends AbstractModule { } @Override public void processAnalyzers(AnalyzersBindings analyzersBindings) { + analyzersBindings.processAnalyzer("pattern", PatternAnalyzerProvider.class); + analyzersBindings.processAnalyzer("arabic", ArabicAnalyzerProvider.class); analyzersBindings.processAnalyzer("brazilian", BrazilianAnalyzerProvider.class); analyzersBindings.processAnalyzer("chinese", ChineseAnalyzerProvider.class); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java new file mode 100644 index 00000000000..40a4c070271 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PatternAnalyzerProvider.java @@ -0,0 +1,71 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.collect.ImmutableSet;
+import org.elasticsearch.common.collect.Iterators;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Provides a {@link PatternAnalyzer} built from the {@code lowercase}, {@code stopwords}, {@code pattern} and {@code flags} settings.
+ *
+ * @author kimchy (shay.banon)
+ */
+public class PatternAnalyzerProvider extends AbstractIndexAnalyzerProvider {
+
+    private final Set stopWords;
+
+    private final PatternAnalyzer analyzer;
+
+    @Inject public PatternAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name);
+
+        boolean toLowerCase = settings.getAsBoolean("lowercase", true);
+
+        String[] configured = settings.getAsArray("stopwords", null);
+        this.stopWords = configured != null
+                ? ImmutableSet.copyOf(Iterators.forArray(configured))
+                : ImmutableSet.copyOf((Iterable) StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+
+        String regex = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
+        if (regex == null) {
+            throw new ElasticSearchIllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
+        }
+        Pattern compiled = Regex.compile(regex, settings.get("flags"));
+
+        analyzer = new PatternAnalyzer(Lucene.ANALYZER_VERSION, compiled, toLowerCase, this.stopWords);
+    }
+
+    @Override public PatternAnalyzer get() {
+        return analyzer;
+    }
+}
diff --git
a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java index d4ed3756bb3..73143b99fd9 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java @@ -33,7 +33,7 @@ import static org.hamcrest.MatcherAssert.*; import static org.hamcrest.Matchers.*; /** - * @author kimchy (Shay Banon) + * @author kimchy (shay.banon) */ public class AnalysisModuleTests { diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json index f5d49e7a289..9e8bf971b64 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json @@ -1,29 +1,29 @@ { - index : { - analysis : { - tokenizer : { - standard : { - type : "standard" + "index" : { + "analysis" : { + "tokenizer" : { + "standard" : { + "type" : "standard" } }, - filter : { - stop : { - type : "stop", - stopwords : ["test-stop"] + "filter" : { + "stop" : { + "type" : "stop", + "stopwords" : ["test-stop"] }, - stop2 : { - type : "stop", - stopwords : ["stop2-1", "stop2-2"] + "stop2" : { + "type" : "stop", + "stopwords" : ["stop2-1", "stop2-2"] } }, - analyzer : { - standard : { - type : "standard", - stopwords : ["test1", "test2", "test3"] + "analyzer" : { + "standard" : { + "type" : "standard", + "stopwords" : ["test1", "test2", "test3"] }, - custom1 : { - tokenizer : "standard", - filter : ["stop", "stop2"] + "custom1" : { + "tokenizer" : "standard", + "filter" : ["stop", "stop2"] } } }