Analysis: Add pattern analyzer, closes #276.
parent 477a24efc6
commit ac7c8cb650
@@ -95,6 +95,7 @@
 <w>queryparser</w>
 <w>rackspace</w>
 <w>rebalance</w>
+<w>regex</w>
 <w>reparse</w>
 <w>retrans</w>
 <w>retval</w>
@@ -0,0 +1,59 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.common.regex;
+
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.Strings;
+
+import java.util.regex.Pattern;
+
+/**
+ * @author kimchy (shay.banon)
+ */
+public class Regex {
+
+    public static Pattern compile(String regex, String flags) {
+        int pFlags = 0;
+        if (flags != null) {
+            for (String s : Strings.delimitedListToStringArray(flags, "|")) {
+                if ("CASE_INSENSITIVE".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.CASE_INSENSITIVE;
+                } else if ("MULTILINE".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.MULTILINE;
+                } else if ("DOTALL".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.DOTALL;
+                } else if ("UNICODE_CASE".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.UNICODE_CASE;
+                } else if ("CANON_EQ".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.CANON_EQ;
+                } else if ("UNIX_LINES".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.UNIX_LINES;
+                } else if ("LITERAL".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.LITERAL;
+                } else if ("COMMENTS".equalsIgnoreCase(s)) {
+                    pFlags |= Pattern.COMMENTS;
+                } else {
+                    throw new ElasticSearchIllegalArgumentException("Unknown regex flag [" + s + "] to compile [" + regex + "]");
+                }
+            }
+        }
+        return Pattern.compile(regex, pFlags);
+    }
+}
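A minimal usage sketch (not part of the commit) of the Regex helper added above, assuming the class is on the classpath: the flag string is split on "|" and each name is mapped to the matching java.util.regex.Pattern constant. RegexCompileExample is a hypothetical caller used only for illustration.

import org.elasticsearch.common.regex.Regex;

import java.util.regex.Pattern;

// Hypothetical caller, for illustration only.
public class RegexCompileExample {
    public static void main(String[] args) {
        // "CASE_INSENSITIVE|UNICODE_CASE" maps to Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE
        Pattern nonWord = Regex.compile("\\W+", "CASE_INSENSITIVE|UNICODE_CASE");
        for (String token : nonWord.split("Foo Bar,Baz")) {
            System.out.println(token); // prints Foo, Bar, Baz on separate lines
        }
        // An unrecognized flag name, e.g. "NO_SUCH_FLAG", throws ElasticSearchIllegalArgumentException.
    }
}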
@@ -286,6 +286,8 @@ public class AnalysisModule extends AbstractModule {
     }
 
     @Override public void processAnalyzers(AnalyzersBindings analyzersBindings) {
+        analyzersBindings.processAnalyzer("pattern", PatternAnalyzerProvider.class);
+
         analyzersBindings.processAnalyzer("arabic", ArabicAnalyzerProvider.class);
         analyzersBindings.processAnalyzer("brazilian", BrazilianAnalyzerProvider.class);
         analyzersBindings.processAnalyzer("chinese", ChineseAnalyzerProvider.class);
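With the binding above, an index configuration can now select the analyzer by its registered name, "pattern". Below is a sketch of the equivalent settings built programmatically; it assumes the ImmutableSettings builder API of this Elasticsearch line, and my_pattern is an arbitrary analyzer name chosen for the example.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

// Illustrative only: mirrors what the index settings file would declare.
public class PatternAnalyzerSettingsExample {
    public static Settings patternAnalyzerSettings() {
        return ImmutableSettings.settingsBuilder()
                .put("index.analysis.analyzer.my_pattern.type", "pattern")           // resolved through the "pattern" binding above
                .put("index.analysis.analyzer.my_pattern.pattern", "\\W+")           // regex handed to Regex.compile
                .put("index.analysis.analyzer.my_pattern.flags", "CASE_INSENSITIVE") // optional flag string
                .put("index.analysis.analyzer.my_pattern.lowercase", true)           // defaults to true
                .putArray("index.analysis.analyzer.my_pattern.stopwords", "a", "an", "the")
                .build();
    }
}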
@@ -0,0 +1,71 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.collect.ImmutableSet;
+import org.elasticsearch.common.collect.Iterators;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * @author kimchy (shay.banon)
+ */
+public class PatternAnalyzerProvider extends AbstractIndexAnalyzerProvider<PatternAnalyzer> {
+
+    private final Set<String> stopWords;
+
+    private final PatternAnalyzer analyzer;
+
+    @Inject public PatternAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name);
+
+        boolean lowercase = settings.getAsBoolean("lowercase", true);
+
+        String[] stopWords = settings.getAsArray("stopwords", null);
+        if (stopWords != null) {
+            this.stopWords = ImmutableSet.copyOf(Iterators.forArray(stopWords));
+        } else {
+            this.stopWords = ImmutableSet.copyOf((Iterable<? extends String>) StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+        }
+
+        String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
+        if (sPattern == null) {
+            throw new ElasticSearchIllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
+        }
+        Pattern pattern = Regex.compile(sPattern, settings.get("flags"));
+
+        analyzer = new PatternAnalyzer(Lucene.ANALYZER_VERSION, pattern, lowercase, this.stopWords);
+    }
+
+    @Override public PatternAnalyzer get() {
+        return analyzer;
+    }
+}
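For the defaults read above (pattern "\W+", lowercase true, English stop words), the provider ends up building the Lucene PatternAnalyzer sketched below. This is illustrative only; Version.LUCENE_30 stands in for Lucene.ANALYZER_VERSION and is an assumption about the bundled Lucene version.

import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
import org.apache.lucene.util.Version;

import java.util.regex.Pattern;

// Illustrative only: bypasses the injected Settings and hard-codes the provider defaults.
public class PatternAnalyzerDefaults {
    public static PatternAnalyzer defaultPatternAnalyzer() {
        return new PatternAnalyzer(
                Version.LUCENE_30,                     // assumed stand-in for Lucene.ANALYZER_VERSION
                Pattern.compile("\\W+"),               // default "pattern" setting
                true,                                  // default "lowercase" setting
                StopAnalyzer.ENGLISH_STOP_WORDS_SET);  // default "stopwords"
    }
}

Analyzing "The Quick Brown Fox" with this analyzer yields the terms quick, brown, and fox: non-word characters split the input, tokens are lowercased, and "the" is dropped as a stop word.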
@@ -33,7 +33,7 @@ import static org.hamcrest.MatcherAssert.*;
 import static org.hamcrest.Matchers.*;
 
 /**
- * @author kimchy (Shay Banon)
+ * @author kimchy (shay.banon)
  */
 public class AnalysisModuleTests {
 
@@ -1,29 +1,29 @@
 {
-    index : {
-        analysis : {
-            tokenizer : {
-                standard : {
-                    type : "standard"
+    "index" : {
+        "analysis" : {
+            "tokenizer" : {
+                "standard" : {
+                    "type" : "standard"
                 }
             },
-            filter : {
-                stop : {
-                    type : "stop",
-                    stopwords : ["test-stop"]
+            "filter" : {
+                "stop" : {
+                    "type" : "stop",
+                    "stopwords" : ["test-stop"]
                 },
-                stop2 : {
-                    type : "stop",
-                    stopwords : ["stop2-1", "stop2-2"]
+                "stop2" : {
+                    "type" : "stop",
+                    "stopwords" : ["stop2-1", "stop2-2"]
                 }
             },
-            analyzer : {
-                standard : {
-                    type : "standard",
-                    stopwords : ["test1", "test2", "test3"]
+            "analyzer" : {
+                "standard" : {
+                    "type" : "standard",
+                    "stopwords" : ["test1", "test2", "test3"]
                 },
-                custom1 : {
-                    tokenizer : "standard",
-                    filter : ["stop", "stop2"]
+                "custom1" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["stop", "stop2"]
                 }
             }
         }