diff --git a/plugins/analysis-phonetic/README.md b/plugins/analysis-phonetic/README.md new file mode 100644 index 00000000000..2c0d50a65f6 --- /dev/null +++ b/plugins/analysis-phonetic/README.md @@ -0,0 +1,93 @@ +Phonetic Analysis for Elasticsearch +=================================== + +The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch. + +In order to install the plugin, simply run: + +```sh +bin/plugin install elasticsearch/elasticsearch-analysis-phonetic/2.5.0 +``` + + +| elasticsearch |Phonetic Analysis Plugin| Docs | +|---------------|-----------------------|------------| +| master | Build from source | See below | +| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) | +| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.5.0/#version-250-for-elasticsearch-15) | +| es-1.4 | 2.4.3 | [2.4.3](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.3/#version-243-for-elasticsearch-14) | +| < 1.4.5 | 2.4.2 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.2/#version-242-for-elasticsearch-14) | +| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.1/#version-241-for-elasticsearch-14) | +| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.3.0/#phonetic-analysis-for-elasticsearch) | +| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.2.0/#phonetic-analysis-for-elasticsearch) | +| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.1.0/#phonetic-analysis-for-elasticsearch) | +| es-1.0 | 2.0.0 | [2.0.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.0.0/#phonetic-analysis-for-elasticsearch) | +| es-0.90 | 1.8.0 | 
[1.8.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v1.8.0/#phonetic-analysis-for-elasticsearch) | + +To build a `SNAPSHOT` version, you need to build it with Maven: + +```bash +mvn clean install +plugin --install analysis-phonetic \ + --url file:target/releases/elasticsearch-analysis-phonetic-X.X.X-SNAPSHOT.zip +``` + +## User guide + +The plugin provides a `phonetic` token filter that can be configured with different `encoder` types: +`metaphone`, `doublemetaphone`, `soundex`, `refinedsoundex`, +`caverphone1`, `caverphone2`, `cologne`, `nysiis`, +`koelnerphonetik`, `haasephonetik`, `beidermorse` + +The `replace` parameter (defaults to `true`) controls whether the processed token +is replaced by the encoded one (`true`) or the encoded token is added alongside it (`false`). + +```js +{ + "index" : { + "analysis" : { + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "my_metaphone"] + } + }, + "filter" : { + "my_metaphone" : { + "type" : "phonetic", + "encoder" : "metaphone", + "replace" : false + } + } + } + } +} +``` + +Note that `beidermorse` does not support the `replace` parameter. + + +Questions +--------- + +If you have questions or comments, please use the [mailing list](https://groups.google.com/group/elasticsearch) instead +of the GitHub issue tracker. + +License +------- + + This software is licensed under the Apache 2 license, quoted below. + + Copyright 2009-2014 Elasticsearch + + Licensed under the Apache License, Version 2.0 (the "License"); you may not + use this file except in compliance with the License. You may obtain a copy of + the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + License for the specific language governing permissions and limitations under + the License. 
diff --git a/plugins/analysis-phonetic/pom.xml b/plugins/analysis-phonetic/pom.xml new file mode 100644 index 00000000000..d4cc3e7b871 --- /dev/null +++ b/plugins/analysis-phonetic/pom.xml @@ -0,0 +1,40 @@ + + + 4.0.0 + + org.elasticsearch.plugin + elasticsearch-analysis-phonetic + + jar + Elasticsearch Phonetic Analysis plugin + The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch. + + + org.elasticsearch + elasticsearch-plugin + 2.0.0-SNAPSHOT + + + + + + + + + org.apache.lucene + lucene-analyzers-phonetic + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + + + diff --git a/plugins/analysis-phonetic/src/main/assemblies/plugin.xml b/plugins/analysis-phonetic/src/main/assemblies/plugin.xml new file mode 100644 index 00000000000..f5065e0a0d5 --- /dev/null +++ b/plugins/analysis-phonetic/src/main/assemblies/plugin.xml @@ -0,0 +1,26 @@ + + + plugin + + zip + + false + + + / + true + true + + org.elasticsearch:elasticsearch + + + + / + true + true + + org.apache.lucene:lucene-analyzers-phonetic + + + + diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticAnalysisBinderProcessor.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticAnalysisBinderProcessor.java new file mode 100644 index 00000000000..45d7634081e --- /dev/null +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticAnalysisBinderProcessor.java @@ -0,0 +1,30 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +/** + * Analysis binder processor that registers {@link PhoneticTokenFilterFactory} + * under the token filter name {@code phonetic}. + */ +public class PhoneticAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + + /** + * Binds the filter name {@code phonetic} to {@link PhoneticTokenFilterFactory}. + * + * @param tokenFiltersBindings the bindings the filter is registered with + */ + @Override + public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class); + } +} diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java new file mode 100644 index 00000000000..b23f311268a --- /dev/null +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java @@ -0,0 +1,131 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.commons.codec.Encoder; +import org.apache.commons.codec.language.*; +import org.apache.commons.codec.language.bm.Languages.LanguageSet; +import org.apache.commons.codec.language.bm.NameType; +import org.apache.commons.codec.language.bm.PhoneticEngine; +import org.apache.commons.codec.language.bm.RuleType; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.BeiderMorseFilter; +import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; +import org.apache.lucene.analysis.phonetic.PhoneticFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.analysis.phonetic.HaasePhonetik; +import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik; +import org.elasticsearch.index.analysis.phonetic.Nysiis; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.Arrays; +import java.util.HashSet; + +/** + * Factory for the {@code phonetic} token filter. + * + * The {@code encoder} setting (default {@code metaphone}) selects the algorithm and + * {@code replace} (default {@code true}) is passed, inverted, as the inject flag of the + * created filter. The double metaphone encoder additionally reads {@code max_code_len} + * (default 4). The Beider-Morse encoder reads {@code languageset}, {@code rule_type} + * (default {@code approx}) and {@code name_type} (default {@code generic}) and does not + * use {@code replace}. + */ +public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { + + private final Encoder encoder; + private final boolean replace; + private int maxcodelength; + private String[] languageset; + private NameType nametype; + private RuleType ruletype; + + /** + * Reads the filter settings and selects the encoder. + * + * @throws IllegalArgumentException if {@code encoder}, {@code rule_type} or {@code name_type} has an unknown value + */ + @Inject + public PhoneticTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + this.languageset = null; + this.nametype = null; + this.ruletype = null; + this.maxcodelength = 0; + this.replace = settings.getAsBoolean("replace", true); + // weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default + String encodername = settings.get("encoder", "metaphone"); + if ("metaphone".equalsIgnoreCase(encodername)) { + this.encoder = 
new Metaphone(); + } else if ("soundex".equalsIgnoreCase(encodername)) { + this.encoder = new Soundex(); + } else if ("caverphone1".equalsIgnoreCase(encodername)) { + this.encoder = new Caverphone1(); + } else if ("caverphone2".equalsIgnoreCase(encodername)) { + this.encoder = new Caverphone2(); + } else if ("caverphone".equalsIgnoreCase(encodername)) { + this.encoder = new Caverphone2(); + } else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) { + this.encoder = new RefinedSoundex(); + } else if ("cologne".equalsIgnoreCase(encodername)) { + this.encoder = new ColognePhonetic(); + } else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) { + this.encoder = null; + this.maxcodelength = settings.getAsInt("max_code_len", 4); + } else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) { + this.encoder = null; + this.languageset = settings.getAsArray("languageset"); + String ruleType = settings.get("rule_type", "approx"); + if ("approx".equalsIgnoreCase(ruleType)) { + ruletype = RuleType.APPROX; + } else if ("exact".equalsIgnoreCase(ruleType)) { + ruletype = RuleType.EXACT; + } else { + throw new IllegalArgumentException("No matching rule type [" + ruleType + "] for beider morse encoder"); + } + String nameType = settings.get("name_type", "generic"); + if ("GENERIC".equalsIgnoreCase(nameType)) { + nametype = NameType.GENERIC; + } else if ("ASHKENAZI".equalsIgnoreCase(nameType)) { + nametype = NameType.ASHKENAZI; + } else if ("SEPHARDIC".equalsIgnoreCase(nameType)) { + nametype = NameType.SEPHARDIC; + } else { + // fail fast like rule_type instead of surfacing later as "encoder error" in create() + throw new IllegalArgumentException("No matching name type [" + nameType + "] for beider morse encoder"); + } + } else if ("koelnerphonetik".equalsIgnoreCase(encodername)) { + this.encoder = new KoelnerPhonetik(); + } else if ("haasephonetik".equalsIgnoreCase(encodername)) { + this.encoder = new HaasePhonetik(); + } else if ("nysiis".equalsIgnoreCase(encodername)) { + this.encoder = new 
Nysiis(); + } else { + throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter"); + } + } + + /** + * Wraps the stream in the filter matching the configured encoder: a {@link BeiderMorseFilter} + * when rule and name type are set, a {@link DoubleMetaphoneFilter} when a maximum code length + * is set, otherwise a {@link PhoneticFilter} around the selected encoder. + * + * @param tokenStream the stream to wrap + * @return the phonetic token stream + * @throws IllegalArgumentException if no filter matches the stored configuration + */ + @Override + public TokenStream create(TokenStream tokenStream) { + if (encoder == null) { + if (ruletype != null && nametype != null) { + // Only restrict the languages when some were configured; an empty array (presumably what getAsArray returns for a missing setting - confirm) must not become an empty LanguageSet. + if (languageset != null && languageset.length > 0) { + final LanguageSet languages = LanguageSet.from(new HashSet<String>(Arrays.asList(languageset))); + return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages); + } + return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true)); + } + if (maxcodelength > 0) { + return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace); + } + } else { + return new PhoneticFilter(tokenStream, encoder, !replace); + } + throw new IllegalArgumentException("encoder error"); + } +} diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java new file mode 100644 index 00000000000..880bc00cace --- /dev/null +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java @@ -0,0 +1,71 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis.phonetic; + +/** + * Modified algorithm from the Matching Toolbox by Rainer Schnell + * Java implementation by Jörg Reiher + * + * The Kölner Phonetik (Cologne phonetics) was revised for use in name databases, + * such as a hospital's administration system, by Martin Haase (Institute of + * Linguistics, University of Cologne) and Kai Heitmann (Institute for + * Medical Statistics, Informatics and Epidemiology, Cologne). + * M. Haase und K. Heitmann. Die Erweiterte Kölner Phonetik. 526, 2000. + * + * after: Martin Wilz, Aspekte der Kodierung phonetischer Ähnlichkeiten + * in deutschen Eigennamen, Master's thesis. + * http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf + * + * @author Jörg Prante + */ +public class HaasePhonetik extends KoelnerPhonetik { + + private final static String[] HAASE_VARIATIONS_PATTERNS = {"OWN", "RB", "WSK", "A$", "O$", "SCH", + "GLI", "EAU$", "^CH", "AUX", "EUX", "ILLE"}; + private final static String[] HAASE_VARIATIONS_REPLACEMENTS = {"AUN", "RW", "RSK", "AR", "OW", "CH", + "LI", "O", "SCH", "O", "O", "I"}; + + /** + * Get the Haase variation patterns. + * + * @return the regular expressions in {@code HAASE_VARIATIONS_PATTERNS} + */ + @Override + protected String[] getPatterns() { + return HAASE_VARIATIONS_PATTERNS; + } + + /** + * Get the replacements belonging to the Haase variation patterns. + * + * @return the strings in {@code HAASE_VARIATIONS_REPLACEMENTS}, index-aligned with the patterns + */ + @Override + protected String[] getReplacements() { + return HAASE_VARIATIONS_REPLACEMENTS; + } + + /** + * Get the code character the base class emits for a leading vowel. + * + * @return {@code '9'} + */ + @Override + protected char getCode() { + return '9'; + } +} diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/KoelnerPhonetik.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/KoelnerPhonetik.java new file mode 100644 index 00000000000..a3190fa4686 --- /dev/null +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/KoelnerPhonetik.java @@ -0,0 +1,324 @@ +/* + * 
Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis.phonetic; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Kölner Phonetik (Cologne phonetics) encoder. + * + * H.J. Postel, Die Kölner Phonetik. Ein Verfahren zu Identifizierung + * von Personennamen auf der Grundlage der Gestaltanalyse. 
IBM-Nachrichten 19 (1969), 925-931 + * + * Algorithm from the Matching Toolbox by Rainer Schnell + * Java implementation by Jörg Reiher + * + * with changes by Jörg Prante + * + */ +public class KoelnerPhonetik implements StringEncoder { + + private static final String[] POSTEL_VARIATIONS_PATTERNS = {"AUN", "OWN", "RB", "RW", "WSK", "RSK"}; + private static final String[] POSTEL_VARIATIONS_REPLACEMENTS = {"OWN", "AUN", "RW", "RB", "RSK", "WSK"}; + private Pattern[] variationsPatterns; + private boolean primary = false; + private final Set<Character> csz = new HashSet<Character>(Arrays.asList( + 'C', 'S', 'Z')); + private final Set<Character> ckq = new HashSet<Character>(Arrays.asList( + 'C', 'K', 'Q')); + private final Set<Character> aouhkxq = new HashSet<Character>(Arrays.asList( + 'A', 'O', 'U', 'H', 'K', 'X', 'Q')); + private final Set<Character> ahkloqrux = new HashSet<Character>(Arrays.asList( + 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X')); + + /** + * Constructor for Kölner Phonetik + */ + public KoelnerPhonetik() { + init(); + } + + /** + * Constructor for Kölner Phonetik. + * + * @param useOnlyPrimaryCode if {@code true}, only the whole input (stripped of non-letters and non-digits) is encoded and no combinations of its parts + */ + public KoelnerPhonetik(boolean useOnlyPrimaryCode) { + this(); + this.primary = useOnlyPrimaryCode; + } + + /** + * Get variation patterns + * + * @return string array of variations + */ + protected String[] getPatterns() { + return POSTEL_VARIATIONS_PATTERNS; + } + + /** + * Get the replacements belonging to the variation patterns. + * + * @return string array of replacements, index-aligned with {@link #getPatterns()} + */ + protected String[] getReplacements() { + return POSTEL_VARIATIONS_REPLACEMENTS; + } + + /** + * Get the code character emitted for a leading vowel. + * + * @return {@code '0'} + */ + protected char getCode() { + return '0'; + } + + /** + * Compares the codes of two values. + * + * @param o1 first value, encoded via its {@code toString()} + * @param o2 second value, encoded via its {@code toString()} + * @return 1 if any code of {@code o1} equals any code of {@code o2}, otherwise 0 + */ + public double getRelativeValue(Object o1, Object o2) { + String[] kopho1 = code(expandUmlauts(o1.toString().toUpperCase(Locale.GERMANY))); + String[] kopho2 = code(expandUmlauts(o2.toString().toUpperCase(Locale.GERMANY))); + for (int i = 0; i < kopho1.length; i++) { + for (int ii = 0; ii < kopho2.length; ii++) { + if (kopho1[i].equals(kopho2[ii])) { + return 1; + } + } + } + return 0; + } + + /** + * Encodes an object that must be a {@link String} (or {@code null}). + * + * @param str the value to encode + * @return the encoded value, or {@code null} for {@code null} input + * @throws EncoderException if {@code str} is neither {@code null} nor a {@link String} + */ + @Override + public Object encode(Object str) throws EncoderException 
{ + if (str != null && !(str instanceof String)) { + throw new EncoderException("Parameter supplied to KoelnerPhonetik encode is not of type java.lang.String"); + } + return encode((String) str); + } + + /** + * Encodes a string; the codes of all generated variations are joined with {@code '_'}. + * + * @param str the value to encode + * @return the joined codes, or {@code null} for {@code null} input + */ + @Override + public String encode(String str) throws EncoderException { + if (str == null) return null; + String[] s = code(str); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < s.length; i++) { + sb.append(s[i]); + if (i < s.length - 1) { + sb.append('_'); + } + } + return sb.toString(); + } + + + // Compiles the variation patterns once per instance. + private void init() { + this.variationsPatterns = new Pattern[getPatterns().length]; + for (int i = 0; i < getPatterns().length; i++) { + this.variationsPatterns[i] = Pattern.compile(getPatterns()[i]); + } + } + + // Computes one code per variation of the input. + private String[] code(String str) { + List<String> parts = partition(str); + String[] codes = new String[parts.size()]; + int i = 0; + for (String s : parts) { + codes[i++] = substitute(s); + } + return codes; + } + + // Builds the strings to encode: the stripped input, (unless primary) concatenations of its parts, and all pattern variations of those. + private List<String> partition(String str) { + String primaryForm = str; + List<String> parts = new ArrayList<String>(); + parts.add(primaryForm.replaceAll("[^\\p{L}\\p{N}]", "")); + if (!primary) { + List<String> tmpParts = new ArrayList<String>(); + tmpParts.addAll((Arrays.asList(str.split("[\\p{Z}\\p{C}\\p{P}]")))); + int numberOfParts = tmpParts.size(); + while (tmpParts.size() > 0) { + StringBuilder part = new StringBuilder(); + for (int i = 0; i < tmpParts.size(); i++) { + part.append(tmpParts.get(i)); + if (!(i + 1 == numberOfParts)) { + parts.add(part.toString()); + } + } + tmpParts.remove(0); + } + } + List<String> variations = new ArrayList<String>(); + for (int i = 0; i < parts.size(); i++) { + List<String> variation = getVariations(parts.get(i)); + if (variation != null) { + variations.addAll(variation); + } + } + return variations; + } + + // Expands a string into all variants obtained by keeping or replacing each matched variation pattern. + private List<String> getVariations(String str) { + int position = 0; + List<String> variations = new ArrayList<String>(); + variations.add(""); + while (position < str.length()) { + int i = 0; + int substPos = -1; + while (substPos < position && i < getPatterns().length) { + Matcher m = variationsPatterns[i].matcher(str); + while (substPos < position && m.find()) { + substPos = m.start(); + } + i++; + } + if (substPos >= 
position) { + i--; + List<String> varNew = new ArrayList<String>(); + String prevPart = str.substring(position, substPos); + for (int ii = 0; ii < variations.size(); ii++) { + String tmp = variations.get(ii); + varNew.add(tmp.concat(prevPart + getReplacements()[i])); + variations.set(ii, variations.get(ii) + prevPart + getPatterns()[i]); + } + variations.addAll(varNew); + position = substPos + getPatterns()[i].length(); + } else { + for (int ii = 0; ii < variations.size(); ii++) { + variations.set(ii, variations.get(ii) + str.substring(position, str.length())); + } + position = str.length(); + } + } + return variations; + } + + // Maps an upper-cased, umlaut-expanded string to its digit code and collapses repeated digits. + private String substitute(String str) { + String s = expandUmlauts(str.toUpperCase(Locale.GERMAN)); + s = removeSequences(s); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char current = s.charAt(i); + char next = i + 1 < s.length() ? s.charAt(i + 1) : '_'; + char prev = i > 0 ? s.charAt(i - 1) : '_'; + switch (current) { + case 'A': + case 'E': + case 'I': + case 'J': + case 'Y': + case 'O': + case 'U': + if (i == 0 || ((i == 1) && prev == 'H')) { + sb.append(getCode()); + } + break; + case 'P': + sb.append(next == 'H' ? "33" : '1'); + break; + case 'B': + sb.append('1'); + break; + case 'D': + case 'T': + sb.append(csz.contains(next) ? '8' : '2'); + break; + case 'F': + case 'V': + case 'W': + sb.append('3'); + break; + case 'G': + case 'K': + case 'Q': + sb.append('4'); + break; + case 'C': + if (i == 0) { + sb.append(ahkloqrux.contains(next) ? '4' : '8'); + } else { + sb.append(aouhkxq.contains(next) ? '4' : '8'); + } + if (sb.length() >= 2 && sb.charAt(sb.length() - 2) == '8') { + sb.setCharAt(sb.length() - 1, '8'); + } + break; + case 'X': + sb.append(i < 1 || !ckq.contains(prev) ? 
"48" : '8'); + break; + case 'L': + sb.append('5'); + break; + case 'M': + case 'N': + sb.append('6'); + break; + case 'R': + sb.append('7'); + break; + case 'S': + case 'Z': + sb.append('8'); + break; + case 'H': + break; + } + } + s = sb.toString(); + s = removeSequences(s); + return s; + } + + /** + * Replaces the upper-case umlauts with their two-letter forms (AE, OE, UE). + * + * @param str the upper-cased input + * @return the input with umlauts expanded + */ + private String expandUmlauts(String str) { + return str.replace("\u00C4", "AE").replace("\u00D6", "OE").replace("\u00DC", "UE"); + } + + /** + * Collapses every run of identical adjacent characters to a single character. + * + * @param str the input, may be {@code null} or empty + * @return the collapsed string, or the empty string for {@code null} or empty input + */ + private String removeSequences(String str) { + if (str == null || str.length() == 0) { + return ""; + } + int i = 0, j = 0; + StringBuilder sb = new StringBuilder().append(str.charAt(i++)); + char c; + while (i < str.length()) { + c = str.charAt(i); + if (c != sb.charAt(j)) { + sb.append(c); + j++; + } + i++; + } + return sb.toString(); + } +} diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/Nysiis.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/Nysiis.java new file mode 100644 index 00000000000..3b85ef43915 --- /dev/null +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/Nysiis.java @@ -0,0 +1,329 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis.phonetic; + +import org.apache.commons.codec.EncoderException; +import org.apache.commons.codec.StringEncoder; + +import java.util.regex.Pattern; + +/** + * + * Taken from commons-codec trunk (unreleased yet) + * + * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate + * similar names, but can also be used as a general purpose scheme to find word + * with similar phonemes. + * + *

NYSIIS features an accuracy increase of 2.7% over the traditional Soundex + * algorithm.

+ * + *

Algorithm description: + *

+ * 1. Transcode first characters of name
+ *   1a. MAC ->   MCC
+ *   1b. KN  ->   NN
+ *   1c. K   ->   C
+ *   1d. PH  ->   FF
+ *   1e. PF  ->   FF
+ *   1f. SCH ->   SSS
+ * 2. Transcode last characters of name
+ *   2a. EE, IE          ->   Y
+ *   2b. DT,RT,RD,NT,ND  ->   D
+ * 3. First character of key = first character of name
+ * 4. Transcode remaining characters by following these rules, incrementing by one character each time
+ *   4a. EV  ->   AF  else A,E,I,O,U -> A
+ *   4b. Q   ->   G
+ *   4c. Z   ->   S
+ *   4d. M   ->   N
+ *   4e. KN  ->   N   else K -> C
+ *   4f. SCH ->   SSS
+ *   4g. PH  ->   FF
+ *   4h. H   ->   If previous or next is nonvowel, previous
+ *   4i. W   ->   If previous is vowel, previous
+ *   4j. Add current to key if current != last key character
+ * 5. If last character is S, remove it
+ * 6. If last characters are AY, replace with Y
+ * 7. If last character is A, remove it
+ * 8. Collapse all strings of repeated characters
+ * 9. Add original first character of name as first character of key
+ * 

+ * + * @see NYSIIS on Wikipedia + * @see NYSIIS on dropby.com + * + */ +public class Nysiis implements StringEncoder { + + private static final char[] CHARS_A = new char[]{'A'}; + private static final char[] CHARS_AF = new char[]{'A', 'F'}; + private static final char[] CHARS_C = new char[]{'C'}; + private static final char[] CHARS_FF = new char[]{'F', 'F'}; + private static final char[] CHARS_G = new char[]{'G'}; + private static final char[] CHARS_N = new char[]{'N'}; + private static final char[] CHARS_NN = new char[]{'N', 'N'}; + private static final char[] CHARS_S = new char[]{'S'}; + private static final char[] CHARS_SSS = new char[]{'S', 'S', 'S'}; + private static final Pattern PAT_MAC = Pattern.compile("^MAC"); + private static final Pattern PAT_KN = Pattern.compile("^KN"); + private static final Pattern PAT_K = Pattern.compile("^K"); + private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)"); + private static final Pattern PAT_SCH = Pattern.compile("^SCH"); + private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$"); + private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$"); + private static final char SPACE = ' '; + private static final int TRUE_LENGTH = 6; + + /** + * Tests if the given character is a vowel. + * + * @param c the character to test + * @return {@code true} if the character is a vowel, {@code false} otherwise + */ + private static boolean isVowel(final char c) { + return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'; + } + + /** + * Transcodes the remaining parts of the String. The method operates on a + * sliding window, looking at 4 characters at a time: [i-1, i, i+1, i+2]. 
+ * + * @param prev the previous character + * @param curr the current character + * @param next the next character + * @param aNext the after next character + * @return a transcoded array of characters, starting from the current + * position + */ + private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) { + // 4a. EV -> AF + if (curr == 'E' && next == 'V') { + return CHARS_AF; + } + + // 4a (else). A, E, I, O, U -> A + if (isVowel(curr)) { + return CHARS_A; + } + + // 4b-4d. Q -> G, Z -> S, M -> N + if (curr == 'Q') { + return CHARS_G; + } else if (curr == 'Z') { + return CHARS_S; + } else if (curr == 'M') { + return CHARS_N; + } + + // 4e. KN -> NN else K -> C + if (curr == 'K') { + if (next == 'N') { + return CHARS_NN; + } else { + return CHARS_C; + } + } + + // 4f. SCH -> SSS + if (curr == 'S' && next == 'C' && aNext == 'H') { + return CHARS_SSS; + } + + // 4g. PH -> FF + if (curr == 'P' && next == 'H') { + return CHARS_FF; + } + + // 4h. H -> If previous or next is a non vowel, previous. + if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) { + return new char[]{prev}; + } + + // 4i. W -> If previous is vowel, previous. + if (curr == 'W' && isVowel(prev)) { + return new char[]{prev}; + } + + // no rule matched: keep the current character + return new char[]{curr}; + } + /** + * Indicates the strict mode. + */ + private final boolean strict; + + /** + * Creates an instance of the {@link Nysiis} encoder with strict mode + * (original form), i.e. encoded strings have a maximum length of 6. + */ + public Nysiis() { + this(true); + } + + /** + * Create an instance of the {@link Nysiis} encoder with the specified + * strict mode: + * + *
  • {@code true}: encoded strings have a maximum length of 6
  • {@code false}: + * encoded strings may have arbitrary length
+ * + * @param strict the strict mode + */ + public Nysiis(final boolean strict) { + this.strict = strict; + } + + /** + * Encodes an Object using the NYSIIS algorithm. This method is provided in + * order to satisfy the requirements of the Encoder interface, and will + * throw an {@link EncoderException} if the supplied object is not of type + * {@link String}. + * + * @param obj Object to encode + * @return An object (or a {@link String}) containing the NYSIIS code which + * corresponds to the given String. + * @throws EncoderException if the parameter supplied is not of a {@link String} + * @throws IllegalArgumentException if a character is not mapped + */ + @Override + public Object encode(Object obj) throws EncoderException { + if (!(obj instanceof String)) { + throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String"); + } + return this.nysiis((String) obj); + } + + /** + * Encodes a String using the NYSIIS algorithm. + * + * @param str A String object to encode + * @return A Nysiis code corresponding to the String supplied + * @throws IllegalArgumentException if a character is not mapped + */ + @Override + public String encode(String str) { + return this.nysiis(str); + } + + /** + * Indicates the strict mode for this {@link Nysiis} encoder. + * + * @return {@code true} if the encoder is configured for strict mode, {@code false} + * otherwise + */ + public boolean isStrict() { + return this.strict; + } + + /** + * Retrieves the NYSIIS code for a given String object. 
+ * + * @param str String to encode using the NYSIIS algorithm + * @return A NYSIIS code for the String supplied + */ + public String nysiis(String str) { + if (str == null) { + return null; + } + + // Use the same clean rules as Soundex + str = clean(str); + + if (str.length() == 0) { + return str; + } + + // Translate first characters of name: + // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS + str = PAT_MAC.matcher(str).replaceFirst("MCC"); + str = PAT_KN.matcher(str).replaceFirst("NN"); + str = PAT_K.matcher(str).replaceFirst("C"); + str = PAT_PH_PF.matcher(str).replaceFirst("FF"); + str = PAT_SCH.matcher(str).replaceFirst("SSS"); + + // Translate last characters of name: + // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D + str = PAT_EE_IE.matcher(str).replaceFirst("Y"); + str = PAT_DT_ETC.matcher(str).replaceFirst("D"); + + // First character of key = first character of name. + // The buffer never leaves this method, so the unsynchronized StringBuilder is sufficient. + StringBuilder key = new StringBuilder(str.length()); + key.append(str.charAt(0)); + + // Transcode remaining characters, incrementing by one character each time + final char[] chars = str.toCharArray(); + final int len = chars.length; + + for (int i = 1; i < len; i++) { + final char next = i < len - 1 ? chars[i + 1] : SPACE; + final char aNext = i < len - 2 ? chars[i + 2] : SPACE; + final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext); + System.arraycopy(transcoded, 0, chars, i, transcoded.length); + + // only append the current char to the key if it is different from the last one + if (chars[i] != chars[i - 1]) { + key.append(chars[i]); + } + } + + if (key.length() > 1) { + char lastChar = key.charAt(key.length() - 1); + + // If last character is S, remove it. + if (lastChar == 'S') { + key.deleteCharAt(key.length() - 1); + lastChar = key.charAt(key.length() - 1); + } + + if (key.length() > 2) { + final char last2Char = key.charAt(key.length() - 2); + // If last characters are AY, replace with Y. 
+ if (last2Char == 'A' && lastChar == 'Y') { + key.deleteCharAt(key.length() - 2); + } + } + + // If last character is A, remove it. + if (lastChar == 'A') { + key.deleteCharAt(key.length() - 1); + } + } + + final String string = key.toString(); + return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string; + } + + /** + * Removes every character that is not a letter and upper-cases the rest + * using {@link java.util.Locale#ENGLISH}. + * + * @param str the string to clean, may be {@code null} or empty + * @return the cleaned string; {@code null} or empty input is returned unchanged + */ + static String clean(String str) { + if (str == null || str.length() == 0) { + return str; + } + int len = str.length(); + char[] chars = new char[len]; + int count = 0; + for (int i = 0; i < len; i++) { + if (Character.isLetter(str.charAt(i))) { + chars[count++] = str.charAt(i); + } + } + if (count == len) { + return str.toUpperCase(java.util.Locale.ENGLISH); + } + return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH); + } +} diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/plugin/analysis/AnalysisPhoneticPlugin.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/plugin/analysis/AnalysisPhoneticPlugin.java new file mode 100644 index 00000000000..dacea45e049 --- /dev/null +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/plugin/analysis/AnalysisPhoneticPlugin.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.plugin.analysis;
+
+import org.elasticsearch.index.analysis.AnalysisModule;
+import org.elasticsearch.index.analysis.PhoneticAnalysisBinderProcessor;
+import org.elasticsearch.plugins.AbstractPlugin;
+
+/**
+ * Plugin that contributes a {@link PhoneticAnalysisBinderProcessor} to the
+ * analysis module.
+ */
+public class AnalysisPhoneticPlugin extends AbstractPlugin {
+
+    private static final String PLUGIN_NAME = "analysis-phonetic";
+    private static final String PLUGIN_DESCRIPTION = "Phonetic analysis support";
+
+    /**
+     * @return the plugin name, {@code "analysis-phonetic"}
+     */
+    @Override
+    public String name() {
+        return PLUGIN_NAME;
+    }
+
+    /**
+     * @return a short human-readable description of the plugin
+     */
+    @Override
+    public String description() {
+        return PLUGIN_DESCRIPTION;
+    }
+
+    /**
+     * Adds a new {@link PhoneticAnalysisBinderProcessor} to the given analysis module.
+     *
+     * @param module the analysis module to extend
+     */
+    public void onModule(AnalysisModule module) {
+        final PhoneticAnalysisBinderProcessor phoneticProcessor = new PhoneticAnalysisBinderProcessor();
+        module.addProcessor(phoneticProcessor);
+    }
+}
+
diff --git a/plugins/analysis-phonetic/src/main/resources/es-plugin.properties b/plugins/analysis-phonetic/src/main/resources/es-plugin.properties
new file mode 100644
index 00000000000..cc52b051102
--- /dev/null
+++ b/plugins/analysis-phonetic/src/main/resources/es-plugin.properties
@@ -0,0 +1,3 @@
+plugin=org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin
+version=${project.version}
+lucene=${lucene.version}
diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java
new file mode 100644
index 00000000000..45c3d7cf0ec
--- /dev/null
+++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.inject.Injector;
+import org.elasticsearch.common.inject.ModulesBuilder;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.SettingsModule;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.EnvironmentModule;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNameModule;
+import org.elasticsearch.index.settings.IndexSettingsModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisService;
+import org.elasticsearch.test.ElasticsearchTestCase;
+import org.hamcrest.MatcherAssert;
+import org.junit.Test;
+
+import static org.elasticsearch.common.settings.Settings.settingsBuilder;
+import static org.hamcrest.Matchers.instanceOf;
+
+/**
+ * Checks that the token filter registered under the name {@code "phonetic"} is a
+ * {@link PhoneticTokenFilterFactory} once the phonetic binder processor is installed.
+ */
+public class SimplePhoneticAnalysisTests extends ElasticsearchTestCase {
+
+    @Test
+    public void testPhoneticTokenFilterFactory() {
+        final Settings indexSettings = settingsBuilder()
+                .loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml")
+                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+                .put("path.home", createTempDir())
+                .build();
+
+        final TokenFilterFactory phonetic = testSimpleConfiguration(indexSettings).tokenFilter("phonetic");
+
+        MatcherAssert.assertThat(phonetic, instanceOf(PhoneticTokenFilterFactory.class));
+    }
+
+    /**
+     * Builds an {@link AnalysisService} for an index named {@code "test"} from the
+     * given settings, with a {@link PhoneticAnalysisBinderProcessor} added to the
+     * analysis module.
+     *
+     * @param settings settings passed to the settings, environment, index-settings
+     *                 and analysis modules
+     * @return the analysis service obtained from the child injector
+     */
+    private AnalysisService testSimpleConfiguration(Settings settings) {
+        final Index testIndex = new Index("test");
+
+        // Parent injector: node-level settings, environment and indices analysis.
+        final Injector parent = new ModulesBuilder()
+                .add(new SettingsModule(settings),
+                        new EnvironmentModule(new Environment(settings)),
+                        new IndicesAnalysisModule())
+                .createInjector();
+        final IndicesAnalysisService indicesAnalysis = parent.getInstance(IndicesAnalysisService.class);
+
+        // Child injector: index-level modules plus the phonetic binder processor.
+        final Injector child = new ModulesBuilder()
+                .add(new IndexSettingsModule(testIndex, settings),
+                        new IndexNameModule(testIndex),
+                        new AnalysisModule(settings, indicesAnalysis)
+                                .addProcessor(new PhoneticAnalysisBinderProcessor()))
+                .createChildInjector(parent);
+
+        return child.getInstance(AnalysisService.class);
+    }
+}
diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticIntegrationTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticIntegrationTests.java
new file mode 100644
index 00000000000..7f74879e3ce
--- /dev/null
+++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticIntegrationTests.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.plugins.PluginsService;
+import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.concurrent.ExecutionException;
+
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.hamcrest.CoreMatchers.is;
+import static org.hamcrest.CoreMatchers.notNullValue;
+
+/**
+ * Integration tests that analyze and search text through an analyzer using a
+ * {@code phonetic} token filter with the {@code metaphone} encoder.
+ */
+@ElasticsearchIntegrationTest.ClusterScope(numDataNodes = 1, scope = ElasticsearchIntegrationTest.Scope.SUITE)
+public class SimplePhoneticIntegrationTests extends ElasticsearchIntegrationTest {
+
+    /**
+     * Extends the inherited node settings with the flag that enables loading
+     * plugins from the classpath.
+     */
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal) {
+        final String loadFromClasspathKey = "plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH;
+        return Settings.builder()
+                .put(super.nodeSettings(nodeOrdinal))
+                .put(loadFromClasspathKey, true)
+                .build();
+    }
+
+    /**
+     * Extends the inherited index settings with {@code my_analyzer}: the standard
+     * tokenizer followed by the {@code standard}, {@code lowercase} and
+     * {@code my_metaphone} filters, where {@code my_metaphone} is a phonetic
+     * filter using the metaphone encoder with {@code replace} set to {@code false}.
+     */
+    @Override
+    public Settings indexSettings() {
+        return Settings.builder()
+                .put(super.indexSettings())
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_metaphone")
+                .put("index.analysis.filter.my_metaphone.type", "phonetic")
+                .put("index.analysis.filter.my_metaphone.encoder", "metaphone")
+                .put("index.analysis.filter.my_metaphone.replace", false)
+                .build();
+    }
+
+    @Test
+    public void testPhoneticAnalyzer() throws ExecutionException, InterruptedException {
+        createIndex("test");
+        ensureGreen("test");
+
+        final AnalyzeResponse analyzed = client().admin().indices()
+                .prepareAnalyze("hello world")
+                .setIndex("test")
+                .setAnalyzer("my_analyzer")
+                .execute().get();
+
+        // With replace=false each word yields its encoded form followed by the original token.
+        final String[] expectedTerms = {"HL", "hello", "WRLT", "world"};
+
+        assertThat(analyzed, notNullValue());
+        assertThat(analyzed.getTokens().size(), is(expectedTerms.length));
+        for (int i = 0; i < expectedTerms.length; i++) {
+            assertThat(analyzed.getTokens().get(i).getTerm(), is(expectedTerms[i]));
+        }
+    }
+
+    @Test
+    public void testPhoneticAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException {
+        createIndex("test");
+        ensureGreen("test");
+
+        // Mapping: string field "foo" analyzed with my_analyzer.
+        final XContentBuilder fooMapping = jsonBuilder().startObject()
+                .startObject("type")
+                    .startObject("properties")
+                        .startObject("foo")
+                            .field("type", "string")
+                            .field("analyzer", "my_analyzer")
+                        .endObject()
+                    .endObject()
+                .endObject()
+            .endObject();
+
+        client().admin().indices().preparePutMapping("test").setType("type").setSource(fooMapping).get();
+
+        index("test", "type", "1", "foo", "hello world");
+        refresh();
+
+        // NOTE(review): "helllo" is presumably misspelled on purpose so that the hit
+        // can only come from the phonetic encoding - confirm before "fixing" it.
+        final SearchResponse found = client().prepareSearch("test").setQuery(
+                QueryBuilders.matchQuery("foo", "helllo")
+        ).execute().actionGet();
+
+        final long totalHits = found.getHits().getTotalHits();
+        assertThat(totalHits, is(1L));
+    }
+
+}
diff --git a/plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml b/plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml
new file mode 100644
index 00000000000..41a4e3fc59f
--- /dev/null
+++ b/plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml
@@ -0,0 +1,30 @@
+index:
+    analysis:
+        filter:
+            doublemetaphonefilter:
+                type: phonetic
+                encoder: doublemetaphone
+            metaphonefilter:
+                type: phonetic
+                encoder: metaphone
+            soundexfilter:
+                type: phonetic
+                encoder: soundex
+            refinedsoundexfilter:
+                type: phonetic
+                encoder: refinedsoundex
+            caverphonefilter:
+                type: phonetic
+                encoder: caverphone
+            beidermorsefilter:
+                type: phonetic
+                encoder: beidermorse
+            koelnerphonetikfilter:
+                type: phonetic
+                encoder: koelnerphonetik
+            haasephonetikfilter:
+                type: phonetic
+                encoder: haasephonetik
+            nysiisfilter:
+                type: phonetic
+                encoder: nysiis