migrate branch for analysis-phonetic
This commit is contained in:
@ -0,0 +1,93 @@
Phonetic Analysis for Elasticsearch
The Phonetic Analysis plugin integrates phonetic token filter analysis with Elasticsearch.
In order to install the plugin, simply run:
bin/plugin install elasticsearch/elasticsearch-analysis-phonetic/2.5.0
| elasticsearch |Phonetic Analysis Plugin| Docs |
| master | Build from source | See below |
| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) |
| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.5.0/#version-250-for-elasticsearch-15) |
| es-1.4 | 2.4.3 | [2.4.3](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.3/#version-243-for-elasticsearch-14) |
| < 1.4.5 | 2.4.2 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.2/#version-242-for-elasticsearch-14) |
| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.1/#version-241-for-elasticsearch-14) |
| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.3.0/#phonetic-analysis-for-elasticsearch) |
| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.2.0/#phonetic-analysis-for-elasticsearch) |
| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.1.0/#phonetic-analysis-for-elasticsearch) |
| es-1.0 | 2.0.0 | [2.0.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.0.0/#phonetic-analysis-for-elasticsearch) |
| es-0.90 | 1.8.0 | [1.8.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v1.8.0/#phonetic-analysis-for-elasticsearch) |
To build a `SNAPSHOT` version, you need to build it with Maven:
mvn clean install
plugin --install analysis-phonetic \
--url file:target/releases/elasticsearch-analysis-phonetic-X.X.X-SNAPSHOT.zip
## User guide
The plugin provides a `phonetic` token filter that can be configured with one of the following `encoder` types:
`metaphone`, `doublemetaphone`, `soundex`, `refinedsoundex`,
`caverphone1`, `caverphone2`, `cologne`, `nysiis`,
`koelnerphonetik`, `haasephonetik`, `beidermorse`
The `replace` parameter (defaults to `true`) controls whether the processed token
is replaced by its encoded form (`true`) or the encoded form is added alongside the original token (`false`).
"index" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "my_metaphone"]
"filter" : {
"my_metaphone" : {
"type" : "phonetic",
"encoder" : "metaphone",
"replace" : false
Note that the `beidermorse` encoder does not support the `replace` parameter.
If you have questions or comments, please use the [mailing list](https://groups.google.com/group/elasticsearch) instead
of the GitHub issue tracker.
This software is licensed under the Apache 2 license, quoted below.
Copyright 2009-2014 Elasticsearch <http://www.elasticsearch.org>
Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.
@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<name>Elasticsearch Phonetic Analysis plugin</name>
<description>The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch.</description>
<!-- You can add any specific project property here -->
@ -0,0 +1,26 @@
<?xml version="1.0"?>
@ -0,0 +1,30 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis;
// Binder processor that maps the token filter type name "phonetic" to PhoneticTokenFilterFactory.
public class PhoneticAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
// Called with the token filter bindings; registers exactly one filter type.
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
@ -0,0 +1,131 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.*;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.phonetic.HaasePhonetik;
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
import org.elasticsearch.index.analysis.phonetic.Nysiis;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Arrays;
import java.util.HashSet;
// Token filter factory for the "phonetic" filter type. The constructor picks an encoder
// (or records the parameters for double metaphone / Beider-Morse) from the filter settings;
// create() then wraps a token stream accordingly.
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
// commons-codec encoder; set to null when double metaphone or Beider-Morse is selected
private final Encoder encoder;
// setting "replace" (default true); create() passes !replace to the Lucene filters
private final boolean replace;
// setting "max_code_len" (default 4), assigned only for double metaphone; 0 otherwise
private int maxcodelength;
// Beider-Morse only: setting "languageset"
private String[] languageset;
// Beider-Morse only: setting "name_type" (default "generic")
private NameType nametype;
// Beider-Morse only: setting "rule_type" (default "approx")
private RuleType ruletype;
// Reads "replace" and "encoder" (plus encoder-specific options) from the filter settings.
// Encoder names are compared case-insensitively.
// Throws IllegalArgumentException for an unknown encoder name or an unknown Beider-Morse rule type.
public PhoneticTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
// Start from "nothing selected"; the branches below fill in what the chosen encoder needs.
this.languageset = null;
this.nametype = null;
this.ruletype = null;
this.maxcodelength = 0;
this.replace = settings.getAsBoolean("replace", true);
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
String encodername = settings.get("encoder", "metaphone");
if ("metaphone".equalsIgnoreCase(encodername)) {
this.encoder = new Metaphone();
} else if ("soundex".equalsIgnoreCase(encodername)) {
this.encoder = new Soundex();
} else if ("caverphone1".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone1();
} else if ("caverphone2".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone2();
// plain "caverphone" is an alias for Caverphone2
} else if ("caverphone".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone2();
} else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) {
this.encoder = new RefinedSoundex();
} else if ("cologne".equalsIgnoreCase(encodername)) {
this.encoder = new ColognePhonetic();
} else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) {
// No Encoder instance: create() builds a DoubleMetaphoneFilter when maxcodelength > 0.
this.encoder = null;
this.maxcodelength = settings.getAsInt("max_code_len", 4);
} else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) {
// No Encoder instance: create() builds a BeiderMorseFilter when ruletype and nametype are both set.
this.encoder = null;
this.languageset = settings.getAsArray("languageset");
String ruleType = settings.get("rule_type", "approx");
if ("approx".equalsIgnoreCase(ruleType)) {
ruletype = RuleType.APPROX;
} else if ("exact".equalsIgnoreCase(ruleType)) {
ruletype = RuleType.EXACT;
} else {
throw new IllegalArgumentException("No matching rule type [" + ruleType + "] for beider morse encoder");
String nameType = settings.get("name_type", "generic");
// NOTE(review): no else branch is visible here for an unrecognised name_type; nametype would
// then stay null and create() would end in its generic "encoder error" — confirm against the full file.
if ("GENERIC".equalsIgnoreCase(nameType)) {
nametype = NameType.GENERIC;
} else if ("ASHKENAZI".equalsIgnoreCase(nameType)) {
nametype = NameType.ASHKENAZI;
} else if ("SEPHARDIC".equalsIgnoreCase(nameType)) {
nametype = NameType.SEPHARDIC;
} else if ("koelnerphonetik".equalsIgnoreCase(encodername)) {
this.encoder = new KoelnerPhonetik();
} else if ("haasephonetik".equalsIgnoreCase(encodername)) {
this.encoder = new HaasePhonetik();
} else if ("nysiis".equalsIgnoreCase(encodername)) {
this.encoder = new Nysiis();
} else {
throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
public TokenStream create(TokenStream tokenStream) {
if (encoder == null) {
if (ruletype != null && nametype != null) {
if (languageset != null) {
final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList(languageset)));
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
if (maxcodelength > 0) {
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
} else {
return new PhoneticFilter(tokenStream, encoder, !replace);
throw new IllegalArgumentException("encoder error");
@ -0,0 +1,71 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis.phonetic;
* Modified algorithm from Rainer Schnell's Matching Toolbox
* Java implementation by Jörg Reiher
* The Kölner Phonetik (Cologne phonetics) was revised for use in name databases,
* such as a hospital's administration system, by Martin Haase (Institut für
* Sprachwissenschaft, Universität zu Köln) and Kai Heitmann (Institut für
* medizinische Statistik, Informatik und Epidemiologie, Köln).
* M. Haase and K. Heitmann. Die Erweiterte Kölner Phonetik. 526, 2000.
* Based on: Martin Wilz, Aspekte der Kodierung phonetischer Ähnlichkeiten
* in deutschen Eigennamen, Magisterarbeit (master's thesis).
* http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
* @author <a href="mailto:joergprante@gmail.com">Jörg Prante</a>
// Variant of KoelnerPhonetik that overrides the variation patterns, their replacements and the code character.
public class HaasePhonetik extends KoelnerPhonetik {
// Pattern i is paired with replacement i in the array below (both have 12 entries).
// Entries containing ^ or $ are regex anchors.
private final static String[] HAASE_VARIATIONS_PATTERNS = {"OWN", "RB", "WSK", "A$", "O$", "SCH",
"GLI", "EAU$", "^CH", "AUX", "EUX", "ILLE"};
private final static String[] HAASE_VARIATIONS_REPLACEMENTS = {"AUN", "RW", "RSK", "AR", "OW", "CH",
"LI", "O", "SCH", "O", "O", "I"};
* @return
protected String[] getPatterns() {
// NOTE(review): the return statement is not visible in this view; presumably HAASE_VARIATIONS_PATTERNS — confirm.
* @return
protected String[] getReplacements() {
// NOTE(review): the return statement is not visible in this view; presumably HAASE_VARIATIONS_REPLACEMENTS — confirm.
* @return
protected char getCode() {
// This variant uses '9' as its code character.
return '9';
@ -0,0 +1,324 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis.phonetic;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
* Kölner Phonetik (Cologne phonetics)
* H.J. Postel, Die Kölner Phonetik. Ein Verfahren zur Identifizierung
* von Personennamen auf der Grundlage der Gestaltanalyse. IBM-Nachrichten 19 (1969), 925-931
* Algorithm from Rainer Schnell's Matching Toolbox
* Java implementation by Jörg Reiher
* with modifications by Jörg Prante
// StringEncoder implementation of the Kölner Phonetik; subclasses can override
// getPatterns(), getReplacements() and getCode().
public class KoelnerPhonetik implements StringEncoder {
// Pattern i is paired with replacement i; each pair also appears reversed (e.g. AUN<->OWN).
private static final String[] POSTEL_VARIATIONS_PATTERNS = {"AUN", "OWN", "RB", "RW", "WSK", "RSK"};
private static final String[] POSTEL_VARIATIONS_REPLACEMENTS = {"OWN", "AUN", "RW", "RB", "RSK", "WSK"};
// Compiled form of getPatterns(), filled by init().
private Pattern[] variationsPatterns;
// When true, partition() skips the block guarded by "if (!primary)".
private boolean primary = false;
// Character classes consulted by substitute() when looking at the neighbouring character.
// NOTE(review): the HashSet instances below are created as raw types.
private final Set<Character> csz = new HashSet(Arrays.asList(
'C', 'S', 'Z'));
private final Set<Character> ckq = new HashSet(Arrays.asList(
'C', 'K', 'Q'));
private final Set<Character> aouhkxq = new HashSet(Arrays.asList(
'A', 'O', 'U', 'H', 'K', 'X', 'Q'));
private final Set<Character> ahkloqrux = new HashSet(Arrays.asList(
'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'));
* Constructor for Kölner Phonetik
public KoelnerPhonetik() {
// NOTE(review): the body of this constructor is not visible in this view.
* @param useOnlyPrimaryCode
public KoelnerPhonetik(boolean useOnlyPrimaryCode) {
// Stores the flag in "primary"; any further statements are not visible in this view.
this.primary = useOnlyPrimaryCode;
* Get variation patterns
* @return string array of variations
protected String[] getPatterns() {
// Body not visible in this view. init() compiles every returned entry with Pattern.compile.
* @return
protected String[] getReplacements() {
// Body not visible in this view. getVariations() indexes this array with the same index as getPatterns().
* @return
protected char getCode() {
// The base class returns '0' as its code character.
return '0';
* @param o1
* @param o2
* @return
public double getRelativeValue(Object o1, Object o2) {
// Compares two values by their phonetic codes: returns 1 as soon as any code of o1
// equals any code of o2, otherwise 0. Both arguments are converted with toString(),
// upper-cased with Locale.GERMANY and passed through expandUmlauts() before coding.
String[] kopho1 = code(expandUmlauts(o1.toString().toUpperCase(Locale.GERMANY)));
String[] kopho2 = code(expandUmlauts(o2.toString().toUpperCase(Locale.GERMANY)));
for (int i = 0; i < kopho1.length; i++) {
for (int ii = 0; ii < kopho2.length; ii++) {
if (kopho1[i].equals(kopho2[ii])) {
return 1;
return 0;
public Object encode(Object str) throws EncoderException {
return encode((String) str);
public String encode(String str) throws EncoderException {
// Returns null for null input; otherwise obtains the codes from code(str) and
// returns whatever has been accumulated in sb.
// NOTE(review): the statements that append to sb are not visible in this view; the
// (i < s.length - 1) test suggests a separator between consecutive codes — confirm against the full file.
if (str == null) return null;
String[] s = code(str.toString());
StringBuilder sb = new StringBuilder();
for (int i = 0; i < s.length; i++) {
if (i < s.length - 1) {
return sb.toString();
private void init() {
// Compiles every entry of getPatterns() into variationsPatterns (same index).
// getPatterns() is re-invoked for the length and on every iteration.
this.variationsPatterns = new Pattern[getPatterns().length];
for (int i = 0; i < getPatterns().length; i++) {
this.variationsPatterns[i] = Pattern.compile(getPatterns()[i]);
private String[] code(String str) {
// Splits the input with partition() and maps every part through substitute();
// the result has one code per part, in the same order.
List<String> parts = partition(str);
String[] codes = new String[parts.size()];
int i = 0;
for (String s : parts) {
codes[i++] = substitute(s);
return codes;
private List<String> partition(String str) {
// Builds the list of strings that code() will encode. The first entry is the input with
// every character that is neither a letter nor a digit removed.
// NOTE(review): many statements of this method are not visible in this view (how tmpParts is
// filled, what is appended to part/variations); the comments below cover only what is shown.
String primaryForm = str;
List<String> parts = new ArrayList();
parts.add(primaryForm.replaceAll("[^\\p{L}\\p{N}]", ""));
// Additional parts are only considered when the encoder is not restricted to the primary code.
if (!primary) {
List<String> tmpParts = new ArrayList();
int numberOfParts = tmpParts.size();
while (tmpParts.size() > 0) {
StringBuilder part = new StringBuilder();
for (int i = 0; i < tmpParts.size(); i++) {
if (!(i + 1 == numberOfParts)) {
// For each part, getVariations() may contribute alternative spellings.
List<String> variations = new ArrayList();
for (int i = 0; i < parts.size(); i++) {
List variation = getVariations(parts.get(i));
if (variation != null) {
return variations;
private List getVariations(String str) {
// Walks through str from left to right, looking for the next match of any variation pattern
// at or after "position", and builds spelling variants from the pattern/replacement pairs.
// NOTE(review): several statements (e.g. loop increments, adding varNew to variations) are not
// visible in this view, so the exact contents of the returned list cannot be stated from here.
int position = 0;
List<String> variations = new ArrayList();
while (position < str.length()) {
int i = 0;
// substPos is the start of the first match found at or after position; -1 means none yet.
int substPos = -1;
while (substPos < position && i < getPatterns().length) {
Matcher m = variationsPatterns[i].matcher(str);
while (substPos < position && m.find()) {
substPos = m.start();
if (substPos >= position) {
List<String> varNew = new ArrayList();
// Text between the current position and the match is copied unchanged into both variants.
String prevPart = str.substring(position, substPos);
for (int ii = 0; ii < variations.size(); ii++) {
String tmp = variations.get(ii);
// New variant uses the replacement; the existing variant keeps the pattern text.
varNew.add(tmp.concat(prevPart + getReplacements()[i]));
variations.set(ii, variations.get(ii) + prevPart + getPatterns()[i]);
// Continue after the matched text; the length of the pattern string is used as the match length.
position = substPos + getPatterns()[i].length();
} else {
// No further match: append the remainder of str to every variant and finish.
for (int ii = 0; ii < variations.size(); ii++) {
variations.set(ii, variations.get(ii) + str.substring(position, str.length()));
position = str.length();
return variations;
private String substitute(String str) {
// Converts one string to its digit code: upper-cases with Locale.GERMAN, expands umlauts,
// collapses repeated characters, maps each character by a switch on the current character
// (looking at its neighbours), and collapses repeated characters in the result again.
// NOTE(review): the append statements of several cases and the break statements are not
// visible in this view; only the visible mappings are described below.
String s = expandUmlauts(str.toUpperCase(Locale.GERMAN));
s = removeSequences(s);
StringBuilder sb = new StringBuilder();
for (int i = 0; i < s.length(); i++) {
char current = s.charAt(i);
// '_' stands in for "no neighbour" at either end of the string.
char next = i + 1 < s.length() ? s.charAt(i + 1) : '_';
char prev = i > 0 ? s.charAt(i - 1) : '_';
switch (current) {
case 'A':
case 'E':
case 'I':
case 'J':
case 'Y':
case 'O':
case 'U':
// Vowel-like characters are treated specially at the start, or at index 1 after a leading H.
if (i == 0 || ((i == 1) && prev == 'H')) {
case 'P':
// "33" when followed by H, otherwise '1' (the ternary mixes a String and a char operand).
sb.append(next == 'H' ? "33" : '1');
case 'B':
case 'D':
case 'T':
// '8' when followed by C, S or Z, otherwise '2'.
sb.append(csz.contains(next) ? '8' : '2');
case 'F':
case 'V':
case 'W':
case 'G':
case 'K':
case 'Q':
case 'C':
// The set of following characters that yields '4' differs for a leading C and any other position.
if (i == 0) {
sb.append(ahkloqrux.contains(next) ? '4' : '8');
} else {
sb.append(aouhkxq.contains(next) ? '4' : '8');
// If the code emitted just before the last one is '8', the last one is overwritten with '8'.
if (sb.length() >= 2 && sb.charAt(sb.length() - 2) == '8') {
sb.setCharAt(sb.length() - 1, '8');
case 'X':
// "48", unless the previous character is C, K or Q, in which case '8'.
sb.append(i < 1 || !ckq.contains(prev) ? "48" : '8');
case 'L':
case 'M':
case 'N':
case 'R':
case 'S':
case 'Z':
case 'H':
s = sb.toString();
s = removeSequences(s);
return s;
* @param str
* @return
private String expandUmlauts(String str) {
// Replaces the upper-case umlauts U+00C4, U+00D6 and U+00DC with "AE", "OE" and "UE".
// Lower-case umlauts are not handled here; the visible callers upper-case their input first.
return str.replaceAll("\u00C4", "AE").replaceAll("\u00D6", "OE").replaceAll("\u00DC", "UE");
* @param str
* @return
private String removeSequences(String str) {
// Returns "" for null or empty input. Otherwise starts the result with the first character and
// compares each following character with the result character at index j.
// NOTE(review): the statements inside the comparison and the loop increment are not visible in
// this view; presumably only characters differing from the last kept one are appended — confirm.
if (str == null || str.length() == 0) {
return "";
int i = 0, j = 0;
StringBuilder sb = new StringBuilder().append(str.charAt(i++));
char c;
while (i < str.length()) {
c = str.charAt(i);
if (c != sb.charAt(j)) {
return sb.toString();
@ -0,0 +1,329 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis.phonetic;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import java.util.regex.Pattern;
* Taken from commons-codec trunk (not yet released at the time of writing).
* Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate
* similar names, but can also be used as a general purpose scheme to find words
* with similar phonemes.
* <p> NYSIIS features an accuracy increase of 2.7% over the traditional Soundex
* algorithm. </p>
* <p>Algorithm description:
* <pre>
* 1. Transcode first characters of name
* 1a. MAC -> MCC
* 1b. KN -> NN
* 1c. K -> C
* 1d. PH -> FF
* 1e. PF -> FF
* 1f. SCH -> SSS
* 2. Transcode last characters of name
* 2a. EE, IE -> Y
* 2b. DT,RT,RD,NT,ND -> D
* 3. First character of key = first character of name
* 4. Transcode remaining characters by following these rules, incrementing by one character each time
* 4a. EV -> AF else A,E,I,O,U -> A
* 4b. Q -> G
* 4c. Z -> S
* 4d. M -> N
* 4e. KN -> N else K -> C
* 4f. SCH -> SSS
* 4g. PH -> FF
* 4h. H -> If previous or next is nonvowel, previous
* 4i. W -> If previous is vowel, previous
* 4j. Add current to key if current != last key character
* 5. If last character is S, remove it
* 6. If last characters are AY, replace with Y
* 7. If last character is A, remove it
* 8. Collapse all strings of repeated characters
* 9. Add original first character of name as first character of key
* </pre></p>
* @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
* @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
public class Nysiis implements StringEncoder {
// Shared replacement sequences returned by transcodeRemaining().
private static final char[] CHARS_A = new char[]{'A'};
private static final char[] CHARS_AF = new char[]{'A', 'F'};
private static final char[] CHARS_C = new char[]{'C'};
private static final char[] CHARS_FF = new char[]{'F', 'F'};
private static final char[] CHARS_G = new char[]{'G'};
private static final char[] CHARS_N = new char[]{'N'};
private static final char[] CHARS_NN = new char[]{'N', 'N'};
private static final char[] CHARS_S = new char[]{'S'};
private static final char[] CHARS_SSS = new char[]{'S', 'S', 'S'};
// Prefix patterns (anchored with ^) applied once to the start of the name in nysiis().
private static final Pattern PAT_MAC = Pattern.compile("^MAC");
private static final Pattern PAT_KN = Pattern.compile("^KN");
private static final Pattern PAT_K = Pattern.compile("^K");
private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
private static final Pattern PAT_SCH = Pattern.compile("^SCH");
// Suffix patterns (anchored with $) applied once to the end of the name in nysiis().
private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
// Filler used by nysiis() for the "next" characters beyond the end of the input.
private static final char SPACE = ' ';
// Maximum key length returned by nysiis() in strict mode.
private static final int TRUE_LENGTH = 6;
* Tests if the given character is a vowel.
* @param c the character to test
* @return {@code true} if the character is a vowel, {@code false} otherwise
private static boolean isVowel(final char c) {
// Only the upper-case letters A, E, I, O, U count; 'Y' and lower-case letters do not.
return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
* Transcodes the remaining parts of the String. The method operates on a
* sliding window, looking at 4 characters at a time: [i-1, i, i+1, i+2].
* @param prev the previous character
* @param curr the current character
* @param next the next character
* @param aNext the after next character
* @return a transcoded array of characters, starting from the current
* position
private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
// The rules are tried in order and the first match wins. A result longer than one character
// also covers the following input positions (nysiis() copies it over the input at index i).
// 1. EV -> AF
if (curr == 'E' && next == 'V') {
return CHARS_AF;
// A, E, I, O, U -> A
if (isVowel(curr)) {
return CHARS_A;
// 2. Q -> G, Z -> S, M -> N
if (curr == 'Q') {
return CHARS_G;
} else if (curr == 'Z') {
return CHARS_S;
} else if (curr == 'M') {
return CHARS_N;
// 3. KN -> NN else K -> C
// NOTE(review): the class-level description lists this rule as "KN -> N"; the code returns NN.
if (curr == 'K') {
if (next == 'N') {
return CHARS_NN;
} else {
return CHARS_C;
// 4. SCH -> SSS
if (curr == 'S' && next == 'C' && aNext == 'H') {
return CHARS_SSS;
// PH -> FF
if (curr == 'P' && next == 'H') {
return CHARS_FF;
// 5. H -> If previous or next is a non vowel, previous.
if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
return new char[]{prev};
// 6. W -> If previous is vowel, previous.
if (curr == 'W' && isVowel(prev)) {
return new char[]{prev};
// No rule matched: the current character is kept as is.
return new char[]{curr};
* Indicates the strict mode.
private final boolean strict;
* Creates an instance of the {@link Nysiis} encoder with strict mode
* (original form), i.e. encoded strings have a maximum length of 6.
public Nysiis() {
// NOTE(review): the body of this constructor is not visible in this view; per the description
// above it is expected to select strict mode — confirm against the full file.
* Create an instance of the {@link Nysiis} encoder with the specified
* strict mode:
* <ul> <li>{@code true}: encoded strings have a maximum length of 6</li> <li>{@code false}:
* encoded strings may have arbitrary length</li> </ul>
* @param strict the strict mode
public Nysiis(final boolean strict) {
this.strict = strict;
* Encodes an Object using the NYSIIS algorithm. This method is provided in
* order to satisfy the requirements of the Encoder interface, and will
* throw an {@link EncoderException} if the supplied object is not of type
* {@link String}.
* @param obj Object to encode
* @return An object (or a {@link String}) containing the NYSIIS code which
* corresponds to the given String.
* @throws EncoderException if the parameter supplied is not of a {@link String}
* @throws IllegalArgumentException if a character is not mapped
public Object encode(Object obj) throws EncoderException {
// null is not an instance of String, so a null argument is rejected here with an
// EncoderException, whereas encode(String) returns null for null.
if (!(obj instanceof String)) {
throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
return this.nysiis((String) obj);
* Encodes a String using the NYSIIS algorithm.
* @param str A String object to encode
* @return A Nysiis code corresponding to the String supplied
* @throws IllegalArgumentException if a character is not mapped
public String encode(String str) {
// Plain delegation to nysiis(String), which returns null for null input.
return this.nysiis(str);
* Indicates the strict mode for this {@link Nysiis} encoder.
* @return {@code true} if the encoder is configured for strict mode, {@code false}
* otherwise
public boolean isStrict() {
// nysiis() consults this flag to decide whether the key is truncated to TRUE_LENGTH.
return this.strict;
* Retrieves the NYSIIS code for a given String object.
* @param str String to encode using the NYSIIS algorithm
* @return A NYSIIS code for the String supplied
public String nysiis(String str) {
// Returns null for null input and the cleaned (possibly empty) string when nothing remains
// after clean().
// NOTE(review): the statements that append characters to "key" are not visible in this view;
// the comments below describe only the visible steps.
if (str == null) {
return null;
// Use the same clean rules as Soundex
str = clean(str);
if (str.length() == 0) {
return str;
// Translate first characters of name:
// MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
// The replacements are applied in sequence, each on the result of the previous one.
str = PAT_MAC.matcher(str).replaceFirst("MCC");
str = PAT_KN.matcher(str).replaceFirst("NN");
str = PAT_K.matcher(str).replaceFirst("C");
str = PAT_PH_PF.matcher(str).replaceFirst("FF");
str = PAT_SCH.matcher(str).replaceFirst("SSS");
// Translate last characters of name:
// EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
str = PAT_EE_IE.matcher(str).replaceFirst("Y");
str = PAT_DT_ETC.matcher(str).replaceFirst("D");
// First character of key = first character of name.
StringBuffer key = new StringBuffer(str.length());
// Transcode remaining characters, incrementing by one character each time
final char[] chars = str.toCharArray();
final int len = chars.length;
for (int i = 1; i < len; i++) {
// Positions beyond the end of the input are represented by SPACE.
final char next = i < len - 1 ? chars[i + 1] : SPACE;
final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
// The transcoded sequence overwrites the input in place, so later iterations see it.
System.arraycopy(transcoded, 0, chars, i, transcoded.length);
// only append the current char to the key if it is different from the last one
if (chars[i] != chars[i - 1]) {
// Post-processing of the key tail; each step is guarded by a minimum key length.
if (key.length() > 1) {
char lastChar = key.charAt(key.length() - 1);
// If last character is S, remove it.
if (lastChar == 'S') {
key.deleteCharAt(key.length() - 1);
lastChar = key.charAt(key.length() - 1);
if (key.length() > 2) {
final char last2Char = key.charAt(key.length() - 2);
// If last characters are AY, replace with Y.
if (last2Char == 'A' && lastChar == 'Y') {
key.deleteCharAt(key.length() - 2);
// If last character is A, remove it.
if (lastChar == 'A') {
key.deleteCharAt(key.length() - 1);
final String string = key.toString();
// Strict mode truncates the key to at most TRUE_LENGTH characters.
return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
// Normalises input for encoding: drops every character for which Character.isLetter is false
// and upper-cases the rest with Locale.ENGLISH. null or empty input is returned unchanged.
static String clean(String str) {
if (str == null || str.length() == 0) {
return str;
int len = str.length();
char[] chars = new char[len];
int count = 0;
for (int i = 0; i < len; i++) {
if (Character.isLetter(str.charAt(i))) {
chars[count++] = str.charAt(i);
// Every character was a letter: upper-case the original string directly.
if (count == len) {
return str.toUpperCase(java.util.Locale.ENGLISH);
return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
@ -0,0 +1,44 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.plugin.analysis;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.PhoneticAnalysisBinderProcessor;
import org.elasticsearch.plugins.AbstractPlugin;
// Plugin entry point: exposes the plugin name and description and adds the phonetic
// binder processor to the analysis module.
public class AnalysisPhoneticPlugin extends AbstractPlugin {
public String name() {
return "analysis-phonetic";
public String description() {
return "Phonetic analysis support";
// Adds PhoneticAnalysisBinderProcessor to the given AnalysisModule.
public void onModule(AnalysisModule module) {
module.addProcessor(new PhoneticAnalysisBinderProcessor());
@ -0,0 +1,3 @@
@ -0,0 +1,72 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.hamcrest.MatcherAssert;
import org.junit.Test;
import static org.elasticsearch.common.settings.Settings.settingsBuilder;
import static org.hamcrest.Matchers.instanceOf;
// Unit test: builds an AnalysisService from the phonetic-1.yml settings and checks that the
// token filter named "phonetic" is a PhoneticTokenFilterFactory.
public class SimplePhoneticAnalysisTests extends ElasticsearchTestCase {
public void testPhoneticTokenFilterFactory() {
// NOTE(review): the end of the settings builder chain is not visible in this view.
Settings settings = settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml")
.put("path.home", createTempDir())
AnalysisService analysisService = testSimpleConfiguration(settings);
TokenFilterFactory filterFactory = analysisService.tokenFilter("phonetic");
MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
// Wires a parent injector (settings, environment, indices analysis) and a child injector
// (index settings, index name, analysis module with the phonetic processor), then returns
// the AnalysisService from the child injector.
private AnalysisService testSimpleConfiguration(Settings settings) {
Index index = new Index("test");
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
new EnvironmentModule(new Environment(settings)),
new IndicesAnalysisModule()).createInjector();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(index, settings),
new IndexNameModule(index),
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))
.addProcessor(new PhoneticAnalysisBinderProcessor())).createChildInjector(parentInjector);
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
return analysisService;
@ -0,0 +1,108 @@
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.elasticsearch.index.analysis;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.plugins.PluginsService;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.notNullValue;
// Integration test: runs with one data node per suite, loads plugins from the classpath and
// exercises a metaphone-based analyzer ("my_analyzer") with replace=false.
// NOTE(review): several builder-chain lines (e.g. the terminating calls) are not visible in this view.
@ElasticsearchIntegrationTest.ClusterScope(numDataNodes = 1, scope = ElasticsearchIntegrationTest.Scope.SUITE)
public class SimplePhoneticIntegrationTests extends ElasticsearchIntegrationTest {
protected Settings nodeSettings(int nodeOrdinal) {
return Settings.builder()
.put("plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true)
// Index settings defining "my_analyzer": standard tokenizer, then the standard, lowercase and
// my_metaphone filters; my_metaphone is a phonetic filter with the metaphone encoder and replace=false.
public Settings indexSettings() {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
.putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_metaphone")
.put("index.analysis.filter.my_metaphone.type", "phonetic")
.put("index.analysis.filter.my_metaphone.encoder", "metaphone")
.put("index.analysis.filter.my_metaphone.replace", false)
return settings;
// Analyzing "hello world" is expected to give four tokens: each encoded form ("HL", "WRLT")
// followed by its original token.
public void testPhoneticAnalyzer() throws ExecutionException, InterruptedException {
AnalyzeResponse response = client().admin().indices()
.prepareAnalyze("hello world")
assertThat(response, notNullValue());
assertThat(response.getTokens().size(), is(4));
assertThat(response.getTokens().get(0).getTerm(), is("HL"));
assertThat(response.getTokens().get(1).getTerm(), is("hello"));
assertThat(response.getTokens().get(2).getTerm(), is("WRLT"));
assertThat(response.getTokens().get(3).getTerm(), is("world"));
// Indexes "hello world" in field "foo" and expects the misspelled query "helllo" to match
// exactly one document.
public void testPhoneticAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException {
final XContentBuilder mapping = jsonBuilder().startObject()
.field("type", "string")
.field("analyzer", "my_analyzer")
index("test", "type", "1", "foo", "hello world");
SearchResponse response = client().prepareSearch("test").setQuery(
QueryBuilders.matchQuery("foo", "helllo")
assertThat(response.getHits().getTotalHits(), is(1L));
@ -0,0 +1,30 @@
type: phonetic
encoder: doublemetaphone
type: phonetic
encoder: metaphone
type: phonetic
encoder: soundex
type: phonetic
encoder: refinedsoundex
type: phonetic
encoder: caverphone
type: phonetic
encoder: beidermorse
type: phonetic
encoder: koelnerphonetik
type: phonetic
encoder: haasephonetik
type: phonetic
encoder: nysiis
Reference in New Issue