Analysis: Add pattern analyzer, closes #276.

This commit is contained in:
kimchy 2010-07-25 22:40:50 +03:00
parent 477a24efc6
commit ac7c8cb650
6 changed files with 153 additions and 20 deletions

View File

@ -95,6 +95,7 @@
<w>queryparser</w>
<w>rackspace</w>
<w>rebalance</w>
<w>regex</w>
<w>reparse</w>
<w>retrans</w>
<w>retval</w>

View File

@ -0,0 +1,59 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.regex;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.Strings;
import java.util.regex.Pattern;
/**
* @author kimchy (shay.banon)
*/
/**
 * Utility for compiling a {@link Pattern} from a regex string plus an optional
 * pipe-separated list of symbolic flag names, e.g. {@code "CASE_INSENSITIVE|DOTALL"}.
 *
 * @author kimchy (shay.banon)
 */
public class Regex {

    /**
     * Compiles {@code regex} into a {@link Pattern}, translating the optional
     * {@code flags} string into the corresponding {@link Pattern} bit flags.
     * Flag names are matched case-insensitively.
     *
     * @param regex the regular expression to compile
     * @param flags pipe-separated flag names, or {@code null} for no flags
     * @return the compiled pattern
     * @throws ElasticSearchIllegalArgumentException if an unknown flag name is supplied
     */
    public static Pattern compile(String regex, String flags) {
        int pFlags = 0;
        // BUG FIX: the original tested (flags == null), so caller-supplied flags
        // were never parsed. Flags must be translated when the string is non-null.
        if (flags != null) {
            for (String s : Strings.delimitedListToStringArray(flags, "|")) {
                if ("CASE_INSENSITIVE".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.CASE_INSENSITIVE;
                } else if ("MULTILINE".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.MULTILINE;
                } else if ("DOTALL".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.DOTALL;
                } else if ("UNICODE_CASE".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.UNICODE_CASE;
                } else if ("CANON_EQ".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.CANON_EQ;
                } else if ("UNIX_LINES".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.UNIX_LINES;
                } else if ("LITERAL".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.LITERAL;
                } else if ("COMMENTS".equalsIgnoreCase(s)) {
                    pFlags |= Pattern.COMMENTS;
                } else {
                    throw new ElasticSearchIllegalArgumentException("Unknown regex flag [" + s + "] to compile [" + regex + "]");
                }
            }
        }
        return Pattern.compile(regex, pFlags);
    }
}

View File

@ -286,6 +286,8 @@ public class AnalysisModule extends AbstractModule {
}
@Override public void processAnalyzers(AnalyzersBindings analyzersBindings) {
analyzersBindings.processAnalyzer("pattern", PatternAnalyzerProvider.class);
analyzersBindings.processAnalyzer("arabic", ArabicAnalyzerProvider.class);
analyzersBindings.processAnalyzer("brazilian", BrazilianAnalyzerProvider.class);
analyzersBindings.processAnalyzer("chinese", ChineseAnalyzerProvider.class);

View File

@ -0,0 +1,71 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.collect.ImmutableSet;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Set;
import java.util.regex.Pattern;
/**
* @author kimchy (shay.banon)
*/
/**
 * Provider for the "pattern" analyzer: builds a Lucene {@link PatternAnalyzer}
 * from index settings. Recognized settings: {@code lowercase} (boolean, default
 * true), {@code stopwords} (array; defaults to the English stop-word set),
 * {@code pattern} (regex string, default {@code \W+}), and {@code flags}
 * (pipe-separated {@link Pattern} flag names, parsed by {@link Regex#compile}).
 *
 * @author kimchy (shay.banon)
 */
public class PatternAnalyzerProvider extends AbstractIndexAnalyzerProvider<PatternAnalyzer> {

    private final Set<String> stopWords;

    private final PatternAnalyzer analyzer;

    @Inject public PatternAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        boolean lowercase = settings.getAsBoolean("lowercase", true);
        // Renamed from "stopWords" so the local no longer shadows the field.
        String[] stopWordsSetting = settings.getAsArray("stopwords", null);
        if (stopWordsSetting != null) {
            this.stopWords = ImmutableSet.copyOf(Iterators.forArray(stopWordsSetting));
        } else {
            this.stopWords = ImmutableSet.copyOf((Iterable<? extends String>) StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
        String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
        if (sPattern == null) {
            // Unreachable while the non-null default above is in place; kept as a
            // guard in case the default is ever removed.
            throw new ElasticSearchIllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
        }
        Pattern pattern = Regex.compile(sPattern, settings.get("flags"));
        analyzer = new PatternAnalyzer(Lucene.ANALYZER_VERSION, pattern, lowercase, this.stopWords);
    }

    @Override public PatternAnalyzer get() {
        return analyzer;
    }
}

View File

@ -33,7 +33,7 @@ import static org.hamcrest.MatcherAssert.*;
import static org.hamcrest.Matchers.*;
/**
* @author kimchy (Shay Banon)
* @author kimchy (shay.banon)
*/
public class AnalysisModuleTests {

View File

@ -1,29 +1,29 @@
{
index : {
analysis : {
tokenizer : {
standard : {
type : "standard"
"index" : {
"analysis" : {
"tokenizer" : {
"standard" : {
"type" : "standard"
}
},
filter : {
stop : {
type : "stop",
stopwords : ["test-stop"]
"filter" : {
"stop" : {
"type" : "stop",
"stopwords" : ["test-stop"]
},
stop2 : {
type : "stop",
stopwords : ["stop2-1", "stop2-2"]
"stop2" : {
"type" : "stop",
"stopwords" : ["stop2-1", "stop2-2"]
}
},
analyzer : {
standard : {
type : "standard",
stopwords : ["test1", "test2", "test3"]
"analyzer" : {
"standard" : {
"type" : "standard",
"stopwords" : ["test1", "test2", "test3"]
},
custom1 : {
tokenizer : "standard",
filter : ["stop", "stop2"]
"custom1" : {
"tokenizer" : "standard",
"filter" : ["stop", "stop2"]
}
}
}