Analysis: expose Lucene MappingCharFilter.

This commit is contained in:
belevian 2011-07-08 14:42:24 +02:00 committed by kimchy
parent 9464208f83
commit 888194e903
5 changed files with 145 additions and 0 deletions

View File

@ -366,6 +366,10 @@ public class AnalysisModule extends AbstractModule {
}
private static class ExtendedProcessor extends AnalysisBinderProcessor {
// Registers the "mapping" char filter type and binds it to MappingCharFilterFactory.
@Override public void processCharFilters(CharFiltersBindings charFiltersBindings) {
charFiltersBindings.processCharFilter("mapping", MappingCharFilterFactory.class);
}
@Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);

View File

@ -0,0 +1,121 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Char filter factory that exposes Lucene's {@link MappingCharFilter}.
 *
 * <p>The mapping rules are obtained from {@code Analysis.getWordList(env, settings, "mappings")}.
 * Each rule has the form {@code source => target}. All rules are parsed once, in the constructor,
 * into a single {@link NormalizeCharMap} that is passed to every filter returned by
 * {@link #create(CharStream)}.
 */
@AnalysisSettingsRequired
public class MappingCharFilterFactory extends AbstractCharFilterFactory {

    /** Matches {@code source => target}; whitespace around the arrow is optional. */
    private static final Pattern RULE_PATTERN = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

    /** Character map built from the configured rules. */
    private final NormalizeCharMap normMap;

    /**
     * Loads the mapping rules and builds the character map.
     *
     * @throws ElasticSearchIllegalArgumentException if no rules are configured or a rule is malformed
     */
    @Inject public MappingCharFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);

        List<String> rules = Analysis.getWordList(env, settings, "mappings");
        if (rules == null) {
            throw new ElasticSearchIllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
        }

        normMap = new NormalizeCharMap();
        parseRules(rules, normMap);
    }

    /**
     * Wraps the given stream in a {@link MappingCharFilter} that uses this factory's character map.
     */
    @Override public CharStream create(CharStream tokenStream) {
        return new MappingCharFilter(normMap, tokenStream);
    }

    /**
     * Parses a list of MappingCharFilter style rules ({@code source => target}) and adds each one
     * to the given normalize char map.
     *
     * @param rules the raw rule strings
     * @param map   the map that receives one entry per rule
     * @throws ElasticSearchIllegalArgumentException if a rule does not match {@code source => target}
     *                                               or contains an invalid escape sequence
     */
    private static void parseRules(List<String> rules, NormalizeCharMap map) {
        for (String rule : rules) {
            Matcher m = RULE_PATTERN.matcher(rule);
            if (!m.find()) {
                throw new ElasticSearchIllegalArgumentException("Invalid Mapping Rule : [" + rule + "]");
            }
            String lhs = parseString(m.group(1).trim());
            String rhs = parseString(m.group(2).trim());
            map.add(lhs, rhs);
        }
    }

    /**
     * Resolves backslash escapes in one side of a rule.
     *
     * <p>Supported escapes are {@code \n}, {@code \t}, {@code \r}, {@code \b}, {@code \f} and
     * {@code \}{@code uXXXX} (exactly four hex digits). Any other escaped character, including a
     * backslash, stands for itself.
     *
     * @param s the already trimmed rule side
     * @return {@code s} with all escape sequences replaced by the characters they denote
     * @throws ElasticSearchIllegalArgumentException if {@code s} ends in a lone backslash or
     *                                               contains an incomplete or non-hex unicode escape
     */
    private static String parseString(String s) {
        int len = s.length();
        // The result is never longer than the input, and a per-call builder has no size limit
        // (the previous shared 256-char buffer overflowed on longer values).
        StringBuilder out = new StringBuilder(len);
        int readPos = 0;
        while (readPos < len) {
            char c = s.charAt(readPos++);
            if (c == '\\') {
                if (readPos >= len) {
                    throw new ElasticSearchIllegalArgumentException("Invalid escaped char in [" + s + "]");
                }
                c = s.charAt(readPos++);
                switch (c) {
                    case 'n':
                        c = '\n';
                        break;
                    case 't':
                        c = '\t';
                        break;
                    case 'r':
                        c = '\r';
                        break;
                    case 'b':
                        c = '\b';
                        break;
                    case 'f':
                        c = '\f';
                        break;
                    case 'u':
                        if (readPos + 4 > len) {
                            throw new ElasticSearchIllegalArgumentException("Invalid escaped char in [" + s + "]");
                        }
                        // Decode digit by digit so that signs ("+041", "-001") and non-hex
                        // characters are rejected instead of being accepted or leaking a
                        // NumberFormatException.
                        int code = 0;
                        for (int i = readPos; i < readPos + 4; i++) {
                            int digit = Character.digit(s.charAt(i), 16);
                            if (digit < 0) {
                                throw new ElasticSearchIllegalArgumentException("Invalid escaped char in [" + s + "]");
                            }
                            code = (code << 4) | digit;
                        }
                        c = (char) code;
                        readPos += 4;
                        break;
                    default:
                        // Escaped backslash or any other character: keep the character as is.
                        break;
                }
            }
            out.append(c);
        }
        return out.toString();
    }
}

View File

@ -94,6 +94,12 @@ public class AnalysisModuleTests {
// html = (HtmlStripCharFilterFactory) custom2.charFilters()[1];
// assertThat(html.readAheadLimit(), equalTo(1024));
// verify character mapping: custom5 is configured with the "my_mapping" char filter
analyzer = analysisService.analyzer("custom5").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer custom5 = (CustomAnalyzer) analyzer;
// the mapping filter is a char filter, so it must be looked up via charFilters(), not tokenFilters()
assertThat(custom5.charFilters()[0], instanceOf(MappingCharFilterFactory.class));
// verify aliases
analyzer = analysisService.analyzer("alias1").analyzer();
assertThat(analyzer, instanceOf(StandardAnalyzer.class));

View File

@ -11,6 +11,10 @@
"type" : "html_strip",
"escaped_tags" : ["xxx", "yyy"],
"read_ahead" : 1024
},
"my_mapping" : {
"type" : "mapping",
"mappings" : ["ph=>f", "qu=>q"]
}
},
"filter" : {
@ -57,6 +61,10 @@
"tokenizer" : "standard",
"filter" : ["my"]
},
"custom5" : {
"tokenizer" : "standard",
"char_filter" : ["my_mapping"]
},
"czechAnalyzerWithStemmer" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "stop", "czech_stem"]

View File

@ -8,6 +8,9 @@ index :
type : html_strip
escaped_tags : [xxx, yyy]
read_ahead : 1024
my_mapping :
type : mapping
mappings : [ph=>f, qu=>q]
filter :
stop :
type : stop
@ -41,6 +44,9 @@ index :
custom4 :
tokenizer : standard
filter : [my]
custom5 :
tokenizer : standard
char_filter : [my_mapping]
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]