Analysis: Add pattern analyzer, closes #276.
This commit is contained in:
parent
477a24efc6
commit
ac7c8cb650
|
@ -95,6 +95,7 @@
|
|||
<w>queryparser</w>
|
||||
<w>rackspace</w>
|
||||
<w>rebalance</w>
|
||||
<w>regex</w>
|
||||
<w>reparse</w>
|
||||
<w>retrans</w>
|
||||
<w>retval</w>
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.regex;
|
||||
|
||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.Strings;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* @author kimchy (shay.banon)
|
||||
*/
|
||||
public class Regex {
|
||||
|
||||
public static Pattern compile(String regex, String flags) {
|
||||
int pFlags = 0;
|
||||
if (flags == null) {
|
||||
for (String s : Strings.delimitedListToStringArray(flags, "|")) {
|
||||
if ("CASE_INSENSITIVE".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.CASE_INSENSITIVE;
|
||||
} else if ("MULTILINE".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.MULTILINE;
|
||||
} else if ("DOTALL".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.DOTALL;
|
||||
} else if ("UNICODE_CASE".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.UNICODE_CASE;
|
||||
} else if ("CANON_EQ".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.CANON_EQ;
|
||||
} else if ("UNIX_LINES".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.UNIX_LINES;
|
||||
} else if ("LITERAL".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.LITERAL;
|
||||
} else if ("COMMENTS".equalsIgnoreCase(s)) {
|
||||
pFlags |= Pattern.COMMENTS;
|
||||
} else {
|
||||
throw new ElasticSearchIllegalArgumentException("Unknown regex flag [" + s + "] to compile [" + regex + "]");
|
||||
}
|
||||
}
|
||||
}
|
||||
return Pattern.compile(regex, pFlags);
|
||||
}
|
||||
}
|
|
@ -286,6 +286,8 @@ public class AnalysisModule extends AbstractModule {
|
|||
}
|
||||
|
||||
@Override public void processAnalyzers(AnalyzersBindings analyzersBindings) {
|
||||
analyzersBindings.processAnalyzer("pattern", PatternAnalyzerProvider.class);
|
||||
|
||||
analyzersBindings.processAnalyzer("arabic", ArabicAnalyzerProvider.class);
|
||||
analyzersBindings.processAnalyzer("brazilian", BrazilianAnalyzerProvider.class);
|
||||
analyzersBindings.processAnalyzer("chinese", ChineseAnalyzerProvider.class);
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
|
||||
import org.elasticsearch.ElasticSearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.collect.ImmutableSet;
|
||||
import org.elasticsearch.common.collect.Iterators;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.common.regex.Regex;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* @author kimchy (shay.banon)
|
||||
*/
|
||||
public class PatternAnalyzerProvider extends AbstractIndexAnalyzerProvider<PatternAnalyzer> {
|
||||
|
||||
private final Set<String> stopWords;
|
||||
|
||||
private final PatternAnalyzer analyzer;
|
||||
|
||||
@Inject public PatternAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name);
|
||||
|
||||
boolean lowercase = settings.getAsBoolean("lowercase", true);
|
||||
|
||||
String[] stopWords = settings.getAsArray("stopwords", null);
|
||||
if (stopWords != null) {
|
||||
this.stopWords = ImmutableSet.copyOf(Iterators.forArray(stopWords));
|
||||
} else {
|
||||
this.stopWords = ImmutableSet.copyOf((Iterable<? extends String>) StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
}
|
||||
|
||||
String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
|
||||
if (sPattern == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
|
||||
}
|
||||
Pattern pattern = Regex.compile(sPattern, settings.get("flags"));
|
||||
|
||||
analyzer = new PatternAnalyzer(Lucene.ANALYZER_VERSION, pattern, lowercase, this.stopWords);
|
||||
}
|
||||
|
||||
@Override public PatternAnalyzer get() {
|
||||
return analyzer;
|
||||
}
|
||||
}
|
|
@ -33,7 +33,7 @@ import static org.hamcrest.MatcherAssert.*;
|
|||
import static org.hamcrest.Matchers.*;
|
||||
|
||||
/**
|
||||
* @author kimchy (Shay Banon)
|
||||
* @author kimchy (shay.banon)
|
||||
*/
|
||||
public class AnalysisModuleTests {
|
||||
|
||||
|
|
|
@ -1,29 +1,29 @@
|
|||
{
|
||||
index : {
|
||||
analysis : {
|
||||
tokenizer : {
|
||||
standard : {
|
||||
type : "standard"
|
||||
"index" : {
|
||||
"analysis" : {
|
||||
"tokenizer" : {
|
||||
"standard" : {
|
||||
"type" : "standard"
|
||||
}
|
||||
},
|
||||
filter : {
|
||||
stop : {
|
||||
type : "stop",
|
||||
stopwords : ["test-stop"]
|
||||
"filter" : {
|
||||
"stop" : {
|
||||
"type" : "stop",
|
||||
"stopwords" : ["test-stop"]
|
||||
},
|
||||
stop2 : {
|
||||
type : "stop",
|
||||
stopwords : ["stop2-1", "stop2-2"]
|
||||
"stop2" : {
|
||||
"type" : "stop",
|
||||
"stopwords" : ["stop2-1", "stop2-2"]
|
||||
}
|
||||
},
|
||||
analyzer : {
|
||||
standard : {
|
||||
type : "standard",
|
||||
stopwords : ["test1", "test2", "test3"]
|
||||
"analyzer" : {
|
||||
"standard" : {
|
||||
"type" : "standard",
|
||||
"stopwords" : ["test1", "test2", "test3"]
|
||||
},
|
||||
custom1 : {
|
||||
tokenizer : "standard",
|
||||
filter : ["stop", "stop2"]
|
||||
"custom1" : {
|
||||
"tokenizer" : "standard",
|
||||
"filter" : ["stop", "stop2"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue