update to Lucene Phonetic 3.6, added Nysiis, KoelnerPhonetik, HaasePhonetik, added simple test

This commit is contained in:
Jörg Prante 2012-04-22 12:08:05 +02:00
parent dbd1257e0e
commit 1338bcc936
10 changed files with 862 additions and 271 deletions

View File

@ -5,19 +5,25 @@ The Phonetic Analysis plugin integrates phonetic token filter analysis with elas
In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-phonetic/1.1.0`.
---------------------------------------------
| Phonetic Analysis Plugin | ElasticSearch |
---------------------------------------------
| master | 0.19 -> master |
---------------------------------------------
| 1.1.0 | 0.19 -> master |
---------------------------------------------
| 1.0.0 | 0.18 |
---------------------------------------------
-----------------------------------------------
| Phonetic Analysis Plugin | ElasticSearch |
-----------------------------------------------
| master | 0.19.2 -> master |
-----------------------------------------------
| 1.2.0 | 0.19.2 -> master |
-----------------------------------------------
| 1.1.0 | 0.19 |
-----------------------------------------------
| 1.0.0 | 0.18 |
-----------------------------------------------
A `phonetic` token filter that can be configured with different `encoder` types: `metaphone`, `soundex`, `caverphone`, `refined_soundex`, `double_metaphone` (uses "commons codec":http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html).
A `phonetic` token filter that can be configured with different `encoder` types:
`metaphone`, `doublemetaphone`, `soundex`, `refinedsoundex`,
`caverphone1`, `caverphone2`, `cologne`, `nysiis`,
`koelnerphonetik`, `haasephonetik`
The `replace` parameter (defaults to `true`) controls if the token processed should be replaced with the encoded one (set it to `true`), or added (set it to `false`).
The `replace` parameter (defaults to `true`) controls whether the processed token
is replaced with the encoded one (set it to `true`), or the encoded one is added alongside it (set it to `false`).
{
"index" : {

12
pom.xml
View File

@ -6,7 +6,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-phonetic</artifactId>
<version>1.2.0-SNAPSHOT</version>
<version>1.2.0</version>
<packaging>jar</packaging>
<description>Phonetic Analysis for ElasticSearch</description>
<inceptionYear>2009</inceptionYear>
@ -31,7 +31,7 @@
</parent>
<properties>
<elasticsearch.version>0.19.0.RC3</elasticsearch.version>
<elasticsearch.version>0.19.2</elasticsearch.version>
</properties>
<repositories>
@ -46,9 +46,9 @@
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.6</version>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-phonetic</artifactId>
<version>3.6.0</version>
<scope>compile</scope>
</dependency>
@ -95,7 +95,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.11</version>
<version>2.12</version>
<configuration>
<includes>
<include>**/*Tests.java</include>

View File

@ -1,111 +0,0 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
import java.util.LinkedList;
/**
 * Token filter that emits the Double Metaphone codes of each incoming term.
 *
 * <p>For every non-empty term the primary and the alternate code are computed.
 * When {@code inject} is {@code true} the original term is kept and the codes
 * are emitted after it with a position increment of 0; when it is
 * {@code false} the codes take the place of the original term, the first one
 * inheriting the term's position increment.</p>
 */
public final class DoubleMetaphoneFilter extends TokenFilter {

    private static final String TOKEN_TYPE = "DoubleMetaphone";

    /** Captured token states that still have to be handed out, oldest first. */
    private final LinkedList<State> remainingTokens = new LinkedList<State>();
    private final DoubleMetaphone encoder;
    private final boolean inject;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * @param input   the upstream token stream
     * @param encoder the Double Metaphone encoder used to compute both codes
     * @param inject  {@code true} to keep the original term next to its codes,
     *                {@code false} to emit only the codes
     */
    public DoubleMetaphoneFilter(TokenStream input, DoubleMetaphone encoder, boolean inject) {
        super(input);
        this.encoder = encoder;
        this.inject = inject;
    }

    /**
     * Advances to the next token: first drains queued states, otherwise pulls
     * the next input term and derives its phonetic tokens.
     *
     * @return {@code false} once the input is exhausted and nothing is queued
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            // Tokens produced by an earlier pass go out before new input is read.
            if (!remainingTokens.isEmpty()) {
                // clearAttributes(); // not currently necessary
                restoreState(remainingTokens.removeFirst());
                return true;
            }

            if (!input.incrementToken()) {
                return false;
            }

            // Zero-length terms are passed through untouched.
            if (termAtt.length() == 0) {
                return true;
            }

            int nextIncrement = inject ? 0 : posAtt.getPositionIncrement();

            final String term = termAtt.toString();
            final String primaryCode = encoder.doubleMetaphone(term);
            final String alternateCode = encoder.doubleMetaphone(term, true);

            // Lazily capture state: with a single resulting token no
            // capture/restore round trip is needed at all.
            boolean capturePending = inject;

            if (hasText(primaryCode) && !primaryCode.equals(term)) {
                if (capturePending) {
                    remainingTokens.addLast(captureState());
                }
                posAtt.setPositionIncrement(nextIncrement);
                nextIncrement = 0;
                termAtt.setEmpty().append(primaryCode);
                capturePending = true;
            }

            if (hasText(alternateCode)
                    && !alternateCode.equals(primaryCode)
                    && !primaryCode.equals(term)) {
                if (capturePending) {
                    remainingTokens.addLast(captureState());
                }
                posAtt.setPositionIncrement(nextIncrement);
                termAtt.setEmpty().append(alternateCode);
                capturePending = true;
            }

            // Nothing was queued, so the attributes already hold the only
            // token to return.
            if (remainingTokens.isEmpty()) {
                return true;
            }

            // Queue the last modification as well; the loop head hands out
            // the queued states in order.
            if (capturePending) {
                remainingTokens.addLast(captureState());
            }
        }
    }

    /** Resets the upstream stream and drops any queued token states. */
    @Override
    public void reset() throws IOException {
        input.reset();
        remainingTokens.clear();
    }

    /** Tells whether {@code s} is neither {@code null} nor empty. */
    private static boolean hasText(String s) {
        return s != null && s.length() > 0;
    }
}

View File

@ -1,100 +0,0 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
/**
 * Token filter that adds, or substitutes, a phonetic code for every term. See:
 * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
 *
 * <p>With {@code inject} set, each term that has a usable code yields two
 * tokens: the code first, then the original term at the same position.
 * Without it the term text is simply overwritten by the code.</p>
 */
// LUCENE MONITOR - No need for it in Lucene 3.6
public class PhoneticFilter extends TokenFilter {

    protected boolean inject = true;
    protected Encoder encoder = null;
    protected String name = null;
    /** State of the original token, held back until the next call. */
    protected State save = null;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * @param in      the upstream token stream
     * @param encoder encoder used to compute the phonetic code
     * @param name    name stored on this filter
     * @param inject  {@code true} to emit the original term in addition to its code
     */
    public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
        super(in);
        this.encoder = encoder;
        this.name = name;
        this.inject = inject;
    }

    /**
     * Emits the held-back original token if there is one; otherwise reads the
     * next term and replaces or augments it with its phonetic code.
     *
     * @return {@code false} once the input is exhausted
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (save != null) {
            // clearAttributes(); // not currently necessary
            restoreState(save);
            save = null;
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }

        // Zero-length terms are passed through unchanged.
        if (termAtt.length() == 0) {
            return true;
        }

        final String phonetic = phoneticFormOf(termAtt.toString());
        if (phonetic == null) {
            // No usable code: keep the term as it is.
            return true;
        }

        if (inject) {
            // Both tokens are needed. The code is returned first so that only
            // one state capture is required: the original term is captured
            // with increment 0 and replayed on the next call, while the code
            // keeps the original position increment.
            final int originalIncrement = posAtt.getPositionIncrement();
            posAtt.setPositionIncrement(0);
            save = captureState();
            posAtt.setPositionIncrement(originalIncrement);
        }

        termAtt.setEmpty().append(phonetic);
        return true;
    }

    /** Resets the upstream stream and forgets any held-back token. */
    @Override
    public void reset() throws IOException {
        input.reset();
        save = null;
    }

    /**
     * Encodes {@code value} and returns the code, or {@code null} when the
     * encoder fails, or the code is empty or equal to the input.
     */
    private String phoneticFormOf(String value) {
        try {
            String code = encoder.encode(value).toString();
            if (code.length() > 0 && !value.equals(code)) {
                return code;
            }
        } catch (Exception ignored) {
            // just use the direct text
        }
        return null;
    }
}

View File

@ -16,87 +16,120 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.*;
import org.apache.commons.codec.language.bm.BeiderMorseEncoder;
import org.apache.commons.codec.language.Caverphone1;
import org.apache.commons.codec.language.Caverphone2;
import org.apache.commons.codec.language.ColognePhonetic;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.phonetic.HaasePhonetik;
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
import org.elasticsearch.index.analysis.phonetic.Nysiis;
import org.elasticsearch.index.settings.IndexSettings;
/**
 * Factory for the {@code phonetic} token filter. It reads the {@code encoder}
 * and {@code replace} settings (plus encoder-specific ones such as
 * {@code max_code_len}, {@code languageset}, {@code rule_type} and
 * {@code name_type}) and creates the matching filter in {@link #create}.
 *
 * NOTE(review): this listing shows both sides of a change without +/- markers;
 * a line that uses {@code encoder} as the setting name is typically followed
 * directly by its replacement that uses {@code encodername}. Read the two
 * variants as alternatives, not as consecutive statements.
 */
@AnalysisSettingsRequired
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
// Encoder handed to PhoneticFilter; set to null for the double metaphone and
// beider morse branches, which are built from the fields below instead.
private final Encoder encoder;
// When true the encoded token replaces the original one (filters get !replace as "inject").
private final boolean replace;
// Only set by the double metaphone branch; 0 otherwise.
private int maxcodelength;
// The next three are only set by the beider morse branch; null otherwise.
private String[] languageset;
private NameType nametype;
private RuleType ruletype;
@Inject
public PhoneticTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
this.languageset = null;
this.nametype = null;
this.ruletype = null;
this.maxcodelength = 0;
this.replace = settings.getAsBoolean("replace", true);
// Variant A: "encoder" is mandatory and its absence is rejected.
String encoder = settings.get("encoder");
if (encoder == null) {
throw new ElasticSearchIllegalArgumentException("encoder must be set on phonetic token filter");
}
if ("metaphone".equalsIgnoreCase(encoder)) {
// Variant B: a missing "encoder" setting silently falls back to metaphone.
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
String encodername = settings.get("encoder", "metaphone");
if ("metaphone".equalsIgnoreCase(encodername)) {
this.encoder = new Metaphone();
} else if ("soundex".equalsIgnoreCase(encoder)) {
} else if ("soundex".equalsIgnoreCase(encodername)) {
this.encoder = new Soundex();
} else if ("caverphone1".equalsIgnoreCase(encoder)) {
} else if ("caverphone1".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone1();
} else if ("caverphone2".equalsIgnoreCase(encoder)) {
} else if ("caverphone2".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone2();
} else if ("caverphone".equalsIgnoreCase(encoder)) {
} else if ("caverphone".equalsIgnoreCase(encodername)) {
// plain "caverphone" maps to the Caverphone2 implementation
this.encoder = new Caverphone2();
} else if ("refined_soundex".equalsIgnoreCase(encoder) || "refinedSoundex".equalsIgnoreCase(encoder)) {
} else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) {
this.encoder = new RefinedSoundex();
} else if ("cologne".equalsIgnoreCase(encoder)) {
} else if ("cologne".equalsIgnoreCase(encodername)) {
this.encoder = new ColognePhonetic();
} else if ("double_metaphone".equalsIgnoreCase(encoder) || "doubleMetaphone".equalsIgnoreCase(encoder)) {
DoubleMetaphone doubleMetaphone = new DoubleMetaphone();
doubleMetaphone.setMaxCodeLen(settings.getAsInt("max_code_len", doubleMetaphone.getMaxCodeLen()));
this.encoder = doubleMetaphone;
} else if ("bm".equalsIgnoreCase(encoder) || "beider_morse".equalsIgnoreCase(encoder)) {
BeiderMorseEncoder bm = new BeiderMorseEncoder();
} else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) {
// No Encoder instance here: create() builds a DoubleMetaphoneFilter from maxcodelength.
// NOTE(review): a configured max_code_len <= 0 fails the "maxcodelength > 0" test in create() — confirm intended.
this.encoder = null;
this.maxcodelength = settings.getAsInt("max_code_len", 4);
} else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) {
// No Encoder instance here either: create() builds a BeiderMorseFilter from ruletype/nametype/languageset.
this.encoder = null;
this.languageset = settings.getAsArray("languageset");
String ruleType = settings.get("rule_type", "approx");
if ("approx".equalsIgnoreCase(ruleType)) {
bm.setRuleType(RuleType.APPROX);
ruletype = RuleType.APPROX;
} else if ("exact".equalsIgnoreCase(ruleType)) {
bm.setRuleType(RuleType.EXACT);
ruletype = RuleType.EXACT;
} else {
throw new ElasticSearchIllegalArgumentException("No matching rule type [" + ruleType + "] for beider morse encoder");
}
// NOTE(review): unlike rule_type, an unrecognised name_type is not rejected here;
// in the field-based variant nametype then stays null and create() cannot take the beider morse path.
String nameType = settings.get("name_type", "generic");
if ("GENERIC".equalsIgnoreCase(nameType)) {
bm.setNameType(NameType.GENERIC);
nametype = NameType.GENERIC;
} else if ("ASHKENAZI".equalsIgnoreCase(nameType)) {
bm.setNameType(NameType.ASHKENAZI);
nametype = NameType.ASHKENAZI;
} else if ("SEPHARDIC".equalsIgnoreCase(nameType)) {
bm.setNameType(NameType.SEPHARDIC);
nametype = NameType.SEPHARDIC;
}
this.encoder = bm;
} else if ("koelnerphonetik".equalsIgnoreCase(encodername)) {
this.encoder = new KoelnerPhonetik();
} else if ("haasephonetik".equalsIgnoreCase(encodername)) {
this.encoder = new HaasePhonetik();
} else if ("nysiis".equalsIgnoreCase(encodername)) {
this.encoder = new Nysiis();
} else {
throw new ElasticSearchIllegalArgumentException("unknown encoder [" + encoder + "] for phonetic token filter");
throw new ElasticSearchIllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
}
}
/**
 * Wraps {@code tokenStream} in the filter selected by the constructor.
 * The filters receive {@code !replace} as their inject flag.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
// Variant A: dispatch on the runtime type of the encoder.
if (encoder instanceof DoubleMetaphone) {
return new DoubleMetaphoneFilter(tokenStream, (DoubleMetaphone) encoder, !replace);
// Variant B: a null encoder means beider morse (rule and name type set) or double metaphone (max code length set).
if (encoder == null) {
if (ruletype != null && nametype != null) {
if (languageset != null) {
final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList(languageset)));
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
}
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
}
if (maxcodelength > 0) {
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
}
} else {
return new PhoneticFilter(tokenStream, encoder, !replace);
}
return new org.elasticsearch.index.analysis.PhoneticFilter(tokenStream, encoder, name(), !replace);
// Reached in variant B when the encoder is null and neither special case applies.
throw new ElasticSearchIllegalArgumentException("encoder error");
}
}

View File

@ -0,0 +1,70 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.phonetic;
/**
 * Haase variant of the K&ouml;lner Phonetik.
 *
 * Modified algorithm from the Matching Toolbox by Rainer Schnell,
 * Java implementation by J&ouml;rg Reiher.
 *
 * The K&ouml;lner Phonetik was revised by Martin Haase (Institute of
 * Linguistics, University of Cologne) and Kai Heitmann (Institute of Medical
 * Statistics, Informatics and Epidemiology, Cologne) for use in name
 * databases such as the administration of a hospital.
 * M. Haase und K. Heitmann. Die Erweiterte K&ouml;lner Phonetik. 526, 2000.
 *
 * After: Martin Wilz, Aspekte der Kodierung phonetischer &Auml;hnlichkeiten
 * in deutschen Eigennamen, master's thesis.
 * http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
 *
 * This subclass only swaps the variation tables and the code character that
 * the base class obtains through its protected accessors.
 *
 * @author <a href="mailto:joergprante@gmail.com">J&ouml;rg Prante</a>
 */
public class HaasePhonetik extends KoelnerPhonetik {

    /**
     * Variation patterns; entry {@code i} pairs with entry {@code i} of
     * {@link #REPLACEMENTS}. Some entries carry the regex anchors
     * {@code ^} and {@code $}.
     */
    private static final String[] PATTERNS = {
        "OWN", "RB", "WSK", "A$", "O$", "SCH",
        "GLI", "EAU$", "^CH", "AUX", "EUX", "ILLE"
    };

    /** Replacement texts, index-aligned with {@link #PATTERNS}. */
    private static final String[] REPLACEMENTS = {
        "AUN", "RW", "RSK", "AR", "OW", "CH",
        "LI", "O", "SCH", "O", "O", "I"
    };

    /** Code character returned by {@link #getCode()}. */
    private static final char CODE = '9';

    /**
     * @return the Haase variation patterns
     */
    @Override
    protected String[] getPatterns() {
        return PATTERNS;
    }

    /**
     * @return the Haase replacements, index-aligned with {@link #getPatterns()}
     */
    @Override
    protected String[] getReplacements() {
        return REPLACEMENTS;
    }

    /**
     * @return the code character {@code '9'}
     */
    @Override
    protected char getCode() {
        return CODE;
    }
}

View File

@ -0,0 +1,327 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.phonetic;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
/**
 * K&ouml;lner Phonetik (Cologne phonetics) string encoder.
 *
 * H.J. Postel, Die K&ouml;lner Phonetik. Ein Verfahren zu Identifizierung
 * von Personennamen auf der Grundlage der Gestaltanalyse. IBM-Nachrichten 19 (1969), 925-931
 *
 * Algorithm from the Matching Toolbox by Rainer Schnell,
 * Java implementation by J&ouml;rg Reiher,
 * with changes by J&ouml;rg Prante.
 *
 * <p>An input is split into parts, each part is expanded into spelling
 * variations using the pattern/replacement tables, and every variation is
 * turned into a digit code. {@link #encode(String)} joins all codes with
 * {@code '_'}.</p>
 *
 * <p>Subclasses customise the encoder by overriding {@link #getPatterns()},
 * {@link #getReplacements()} and {@link #getCode()}. {@link #getPatterns()}
 * is called from the constructor, so overrides must not depend on subclass
 * instance state.</p>
 */
public class KoelnerPhonetik implements StringEncoder {

    private static final String[] POSTEL_VARIATIONS_PATTERNS = {"AUN", "OWN", "RB", "RW", "WSK", "RSK"};
    private static final String[] POSTEL_VARIATIONS_REPLACEMENTS = {"OWN", "AUN", "RW", "RB", "RSK", "WSK"};

    // Letter classes consulted by substitute() for context-dependent codes.
    private static final Set<Character> CSZ = new HashSet<Character>(Arrays.asList(
            'C', 'S', 'Z'));
    private static final Set<Character> CKQ = new HashSet<Character>(Arrays.asList(
            'C', 'K', 'Q'));
    private static final Set<Character> AOUHKXQ = new HashSet<Character>(Arrays.asList(
            'A', 'O', 'U', 'H', 'K', 'X', 'Q'));
    private static final Set<Character> AHKLOQRUX = new HashSet<Character>(Arrays.asList(
            'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'));

    /** Compiled form of {@link #getPatterns()}, index-aligned with {@link #getReplacements()}. */
    private Pattern[] variationsPatterns;
    /** When {@code true}, only the whole input (letters and digits) is coded, not its sub-parts. */
    private boolean primary = false;

    /**
     * Creates an encoder that codes the whole input as well as its parts.
     */
    public KoelnerPhonetik() {
        init();
    }

    /**
     * Creates an encoder.
     *
     * @param useOnlyPrimaryCode {@code true} to code only the whole input and
     *                           skip the codes of its separate parts
     */
    public KoelnerPhonetik(boolean useOnlyPrimaryCode) {
        this();
        this.primary = useOnlyPrimaryCode;
    }

    /**
     * Get variation patterns. Each entry is compiled as a regular expression.
     *
     * @return string array of variations
     */
    protected String[] getPatterns() {
        return POSTEL_VARIATIONS_PATTERNS;
    }

    /**
     * Get the replacement texts.
     *
     * @return replacements, index-aligned with {@link #getPatterns()}
     */
    protected String[] getReplacements() {
        return POSTEL_VARIATIONS_REPLACEMENTS;
    }

    /**
     * Get the code character emitted for a vowel at the start of a part
     * (or directly after a leading {@code H}).
     *
     * @return the code character
     */
    protected char getCode() {
        return '0';
    }

    /**
     * Compares two values by their codes.
     *
     * @param o1 first value; its {@code toString()} is upper-cased and coded
     * @param o2 second value; its {@code toString()} is upper-cased and coded
     * @return {@code 1} if the two values share at least one code, {@code 0} otherwise
     */
    public double getRelativeValue(Object o1, Object o2) {
        String[] kopho1 = code(expandUmlauts(o1.toString().toUpperCase(Locale.GERMANY)));
        String[] kopho2 = code(expandUmlauts(o2.toString().toUpperCase(Locale.GERMANY)));
        for (String first : kopho1) {
            for (String second : kopho2) {
                if (first.equals(second)) {
                    return 1;
                }
            }
        }
        return 0;
    }

    /**
     * Encodes an object, which must be a {@link String} (or {@code null}).
     *
     * @param str the object to encode
     * @return the encoded string, or {@code null} for {@code null} input
     * @throws EncoderException if {@code str} is not a {@link String}
     */
    @Override
    public Object encode(Object str) throws EncoderException {
        if (str == null) {
            return null;
        }
        if (!(str instanceof String)) {
            // Report through the declared exception instead of a ClassCastException.
            throw new EncoderException("Parameter supplied to KoelnerPhonetik encode is not of type java.lang.String");
        }
        return encode((String) str);
    }

    /**
     * Encodes a string into its codes, joined with {@code '_'}.
     *
     * <p>NOTE(review): variation patterns are matched against the input as
     * given, while upper-casing happens only later during code substitution,
     * so lower-case input never triggers a variation.
     * {@link #getRelativeValue(Object, Object)} upper-cases first; confirm
     * which behaviour is intended.</p>
     *
     * @param str the string to encode
     * @return the joined codes, or {@code null} for {@code null} input
     * @throws EncoderException declared by the interface; not thrown by this implementation
     */
    @Override
    public String encode(String str) throws EncoderException {
        if (str == null) {
            return null;
        }
        String[] codes = code(str);
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < codes.length; i++) {
            if (i > 0) {
                sb.append('_');
            }
            sb.append(codes[i]);
        }
        return sb.toString();
    }

    /** Compiles the variation patterns once. */
    private void init() {
        String[] patterns = getPatterns();
        this.variationsPatterns = new Pattern[patterns.length];
        for (int i = 0; i < patterns.length; i++) {
            this.variationsPatterns[i] = Pattern.compile(patterns[i]);
        }
    }

    /** Returns one code per variation of every part of {@code str}. */
    private String[] code(String str) {
        List<String> parts = partition(str);
        String[] codes = new String[parts.size()];
        int i = 0;
        for (String part : parts) {
            codes[i++] = substitute(part);
        }
        return codes;
    }

    /**
     * Splits {@code str} into the texts to be coded: the whole input reduced
     * to letters and digits and, unless only the primary code is wanted,
     * every run of consecutive words (concatenated) except the full run.
     * Each text is then expanded into its variations.
     */
    private List<String> partition(String str) {
        List<String> parts = new ArrayList<String>();
        parts.add(str.replaceAll("[^\\p{L}\\p{N}]", ""));
        if (!primary) {
            List<String> words = new ArrayList<String>(Arrays.asList(str.split("[\\p{Z}\\p{C}\\p{P}]")));
            int wordCount = words.size();
            while (!words.isEmpty()) {
                StringBuilder joined = new StringBuilder();
                for (int i = 0; i < words.size(); i++) {
                    joined.append(words.get(i));
                    // The run made of all words is skipped; the first entry covers it.
                    if (i + 1 != wordCount) {
                        parts.add(joined.toString());
                    }
                }
                words.remove(0);
            }
        }
        List<String> variations = new ArrayList<String>();
        for (String part : parts) {
            variations.addAll(getVariations(part));
        }
        return variations;
    }

    /**
     * Expands {@code str} into all spellings obtained by keeping or replacing
     * each pattern occurrence, scanning from left to right.
     *
     * <p>The text actually matched and the end of the match are used to build
     * the variations and to advance. Using the pattern source and its length
     * instead is only correct for literal patterns; with anchored patterns
     * such as {@code ^CH} or {@code A$} it would copy the anchor characters
     * into the text and skip input.</p>
     */
    private List<String> getVariations(String str) {
        List<String> variations = new ArrayList<String>();
        variations.add("");
        String[] replacements = getReplacements();
        int position = 0;
        while (position < str.length()) {
            // The first pattern (in table order) with a match at or after
            // the current position wins.
            int patternIndex = -1;
            int matchStart = -1;
            int matchEnd = -1;
            for (int i = 0; i < variationsPatterns.length && patternIndex < 0; i++) {
                Matcher m = variationsPatterns[i].matcher(str);
                while (m.find()) {
                    // Empty matches are ignored so the scan always advances.
                    if (m.start() >= position && m.end() > m.start()) {
                        patternIndex = i;
                        matchStart = m.start();
                        matchEnd = m.end();
                        break;
                    }
                }
            }

            if (patternIndex < 0) {
                // Nothing left to vary: append the rest to every variation.
                String tail = str.substring(position);
                for (int v = 0; v < variations.size(); v++) {
                    variations.set(v, variations.get(v) + tail);
                }
                break;
            }

            String unchanged = str.substring(position, matchStart);
            String matched = str.substring(matchStart, matchEnd);
            String replacement = replacements[patternIndex];
            // Each existing variation forks into "kept" (in place) and
            // "replaced" (appended after all kept ones).
            int existing = variations.size();
            for (int v = 0; v < existing; v++) {
                String prefix = variations.get(v) + unchanged;
                variations.set(v, prefix + matched);
                variations.add(prefix + replacement);
            }
            position = matchEnd;
        }
        return variations;
    }

    /**
     * Turns one text into its digit code: upper-case, expand umlauts, collapse
     * repeated letters, map letters to digits, collapse repeated digits.
     */
    private String substitute(String str) {
        String s = removeSequences(expandUmlauts(str.toUpperCase(Locale.GERMAN)));
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            char current = s.charAt(i);
            // '_' stands for "no neighbour" at either end.
            char next = i + 1 < s.length() ? s.charAt(i + 1) : '_';
            char prev = i > 0 ? s.charAt(i - 1) : '_';
            switch (current) {
                case 'A':
                case 'E':
                case 'I':
                case 'J':
                case 'Y':
                case 'O':
                case 'U':
                    // Vowels only count at the start, or right after a leading H.
                    if (i == 0 || (i == 1 && prev == 'H')) {
                        sb.append(getCode());
                    }
                    break;
                case 'P':
                    if (next == 'H') {
                        sb.append("33");
                    } else {
                        sb.append('1');
                    }
                    break;
                case 'B':
                    sb.append('1');
                    break;
                case 'D':
                case 'T':
                    sb.append(CSZ.contains(next) ? '8' : '2');
                    break;
                case 'F':
                case 'V':
                case 'W':
                    sb.append('3');
                    break;
                case 'G':
                case 'K':
                case 'Q':
                    sb.append('4');
                    break;
                case 'C':
                    if (i == 0) {
                        sb.append(AHKLOQRUX.contains(next) ? '4' : '8');
                    } else {
                        sb.append(AOUHKXQ.contains(next) ? '4' : '8');
                    }
                    // A C whose preceding code is 8 is coded 8 as well.
                    if (sb.length() >= 2 && sb.charAt(sb.length() - 2) == '8') {
                        sb.setCharAt(sb.length() - 1, '8');
                    }
                    break;
                case 'X':
                    if (i < 1 || !CKQ.contains(prev)) {
                        sb.append("48");
                    } else {
                        sb.append('8');
                    }
                    break;
                case 'L':
                    sb.append('5');
                    break;
                case 'M':
                case 'N':
                    sb.append('6');
                    break;
                case 'R':
                    sb.append('7');
                    break;
                case 'S':
                case 'Z':
                    sb.append('8');
                    break;
                case 'H':
                    break;
                default:
                    // Any other character contributes nothing to the code.
                    break;
            }
        }
        return removeSequences(sb.toString());
    }

    /**
     * Replaces the upper-case umlauts by their two-letter transcriptions.
     *
     * @param str upper-case text
     * @return the text with the umlauts expanded to AE, OE and UE
     */
    private String expandUmlauts(String str) {
        return str.replace("\u00C4", "AE").replace("\u00D6", "OE").replace("\u00DC", "UE");
    }

    /**
     * Collapses every run of identical adjacent characters into one.
     *
     * @param str the text, may be {@code null}
     * @return the collapsed text; the empty string for {@code null} or empty input
     */
    private String removeSequences(String str) {
        if (str == null || str.length() == 0) {
            return "";
        }
        StringBuilder sb = new StringBuilder(str.length());
        char last = str.charAt(0);
        sb.append(last);
        for (int i = 1; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c != last) {
                sb.append(c);
                last = c;
            }
        }
        return sb.toString();
    }
}

View File

@ -0,0 +1,325 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elasticsearch.index.analysis.phonetic;
import java.util.regex.Pattern;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
/**
*
* Taken from commons-codec trunk (unreleased yet)
*
* Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate
* similar names, but can also be used as a general purpose scheme to find word
* with similar phonemes.
*
* <p> NYSIIS features an accuracy increase of 2.7% over the traditional Soundex
* algorithm. </p>
*
* <p>Algorithm description:
* <pre>
* 1. Transcode first characters of name
* 1a. MAC -> MCC
* 1b. KN -> NN
* 1c. K -> C
* 1d. PH -> FF
* 1e. PF -> FF
* 1f. SCH -> SSS
* 2. Transcode last characters of name
* 2a. EE, IE -> Y
* 2b. DT,RT,RD,NT,ND -> D
* 3. First character of key = first character of name
* 4. Transcode remaining characters by following these rules, incrementing by one character each time
* 4a. EV -> AF else A,E,I,O,U -> A
* 4b. Q -> G
* 4c. Z -> S
* 4d. M -> N
* 4e. KN -> N else K -> C
* 4f. SCH -> SSS
* 4g. PH -> FF
* 4h. H -> If previous or next is nonvowel, previous
* 4i. W -> If previous is vowel, previous
* 4j. Add current to key if current != last key character
* 5. If last character is S, remove it
* 6. If last characters are AY, replace with Y
* 7. If last character is A, remove it
* 8. Collapse all strings of repeated characters
* 9. Add original first character of name as first character of key
* </pre></p>
*
* @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
* @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
*
*/
public class Nysiis implements StringEncoder {
private static final char[] CHARS_A = new char[]{'A'};
private static final char[] CHARS_AF = new char[]{'A', 'F'};
private static final char[] CHARS_C = new char[]{'C'};
private static final char[] CHARS_FF = new char[]{'F', 'F'};
private static final char[] CHARS_G = new char[]{'G'};
private static final char[] CHARS_N = new char[]{'N'};
private static final char[] CHARS_NN = new char[]{'N', 'N'};
private static final char[] CHARS_S = new char[]{'S'};
private static final char[] CHARS_SSS = new char[]{'S', 'S', 'S'};
private static final Pattern PAT_MAC = Pattern.compile("^MAC");
private static final Pattern PAT_KN = Pattern.compile("^KN");
private static final Pattern PAT_K = Pattern.compile("^K");
private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
private static final Pattern PAT_SCH = Pattern.compile("^SCH");
private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
private static final char SPACE = ' ';
private static final int TRUE_LENGTH = 6;
/**
* Tests if the given character is a vowel.
*
* @param c the character to test
* @return {@code true} if the character is a vowel, {@code false} otherwise
*/
private static boolean isVowel(final char c) {
return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
}
/**
* Transcodes the remaining parts of the String. The method operates on a
* sliding window, looking at 4 characters at a time: [i-1, i, i+1, i+2].
*
* @param prev the previous character
* @param curr the current character
* @param next the next character
* @param aNext the after next character
* @return a transcoded array of characters, starting from the current
* position
*/
private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
// 1. EV -> AF
if (curr == 'E' && next == 'V') {
return CHARS_AF;
}
// A, E, I, O, U -> A
if (isVowel(curr)) {
return CHARS_A;
}
// 2. Q -> G, Z -> S, M -> N
if (curr == 'Q') {
return CHARS_G;
} else if (curr == 'Z') {
return CHARS_S;
} else if (curr == 'M') {
return CHARS_N;
}
// 3. KN -> NN else K -> C
if (curr == 'K') {
if (next == 'N') {
return CHARS_NN;
} else {
return CHARS_C;
}
}
// 4. SCH -> SSS
if (curr == 'S' && next == 'C' && aNext == 'H') {
return CHARS_SSS;
}
// PH -> FF
if (curr == 'P' && next == 'H') {
return CHARS_FF;
}
// 5. H -> If previous or next is a non vowel, previous.
if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
return new char[]{prev};
}
// 6. W -> If previous is vowel, previous.
if (curr == 'W' && isVowel(prev)) {
return new char[]{prev};
}
return new char[]{curr};
}
/**
* Indicates the strict mode.
*/
private final boolean strict;
/**
* Creates an instance of the {@link Nysiis} encoder with strict mode
* (original form), i.e. encoded strings have a maximum length of 6.
*/
public Nysiis() {
this(true);
}
/**
* Creates an instance of the {@link Nysiis} encoder with the specified
* strict mode:
*
* <ul> <li>{@code true}: encoded strings have a maximum length of 6</li> <li>{@code false}:
* encoded strings may have arbitrary length</li> </ul>
*
* The value is stored as-is and can be read back through {@link #isStrict()}.
*
* @param strict the strict mode
*/
public Nysiis(final boolean strict) {
this.strict = strict;
}
/**
 * Encodes an Object using the NYSIIS algorithm. Only {@link String}
 * arguments are supported; they are handed to {@link #nysiis(String)}.
 * Any other argument, including {@code null}, is rejected with an
 * {@link EncoderException}. The method exists to satisfy the Encoder
 * interface.
 *
 * @param obj Object to encode
 * @return An object (or a {@link String}) containing the NYSIIS code which
 *         corresponds to the given String.
 * @throws EncoderException if the parameter supplied is not of a {@link String}
 */
@Override
public Object encode(Object obj) throws EncoderException {
    if (obj instanceof String) {
        return nysiis((String) obj);
    }
    throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
}
/**
 * Encodes a String using the NYSIIS algorithm by delegating to
 * {@link #nysiis(String)}.
 *
 * @param str A String object to encode, may be {@code null}
 * @return A Nysiis code corresponding to the String supplied, or
 *         {@code null} if {@code str} is {@code null}
 */
@Override
public String encode(String str) {
    return nysiis(str);
}
/**
 * Reports whether this {@link Nysiis} encoder was created in strict mode.
 *
 * @return {@code true} if the encoder is configured for strict mode, {@code false}
 *         otherwise
 */
public boolean isStrict() {
    return strict;
}
/**
 * Retrieves the NYSIIS code for a given String object.
 * <p>
 * The input is first reduced to its upper-cased letters, then the leading
 * and trailing character groups are rewritten, the remaining characters
 * are transcoded one position at a time, and finally the end of the key is
 * tidied up. In strict mode the key is cut to at most {@code TRUE_LENGTH}
 * characters.
 *
 * @param str String to encode using the NYSIIS algorithm, may be {@code null}
 * @return A NYSIIS code for the String supplied; {@code null} if
 *         {@code str} is {@code null}, and an empty string if
 *         {@code str} contains no letters
 */
public String nysiis(String str) {
    if (str == null) {
        return null;
    }
    // Drop everything that is not a letter and upper-case the rest
    str = clean(str);
    if (str.length() == 0) {
        return str;
    }
    // Translate first characters of name:
    // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
    str = PAT_MAC.matcher(str).replaceFirst("MCC");
    str = PAT_KN.matcher(str).replaceFirst("NN");
    str = PAT_K.matcher(str).replaceFirst("C");
    str = PAT_PH_PF.matcher(str).replaceFirst("FF");
    str = PAT_SCH.matcher(str).replaceFirst("SSS");
    // Translate last characters of name:
    // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
    str = PAT_EE_IE.matcher(str).replaceFirst("Y");
    str = PAT_DT_ETC.matcher(str).replaceFirst("D");
    // First character of key = first character of name.
    // The buffer never leaves this method, so the unsynchronized
    // StringBuilder is sufficient (no need for StringBuffer's locking).
    final StringBuilder key = new StringBuilder(str.length());
    key.append(str.charAt(0));
    // Transcode remaining characters, incrementing by one character each time
    final char[] chars = str.toCharArray();
    final int len = chars.length;
    for (int i = 1; i < len; i++) {
        // Positions past the end of the name are presented as SPACE
        final char next = i < len - 1 ? chars[i + 1] : SPACE;
        final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
        final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
        // Write the replacement back in place so that the following windows
        // see the already transcoded characters as their "previous" ones
        System.arraycopy(transcoded, 0, chars, i, transcoded.length);
        // only append the current char to the key if it is different from the last one
        if (chars[i] != chars[i - 1]) {
            key.append(chars[i]);
        }
    }
    if (key.length() > 1) {
        char lastChar = key.charAt(key.length() - 1);
        // If last character is S, remove it.
        if (lastChar == 'S') {
            key.deleteCharAt(key.length() - 1);
            lastChar = key.charAt(key.length() - 1);
        }
        if (key.length() > 2) {
            final char last2Char = key.charAt(key.length() - 2);
            // If last characters are AY, replace with Y.
            if (last2Char == 'A' && lastChar == 'Y') {
                key.deleteCharAt(key.length() - 2);
            }
        }
        // If last character is A, remove it.
        // NOTE(review): when the key was "AS" the S removal above leaves "A",
        // which is removed here as well, producing an empty key - confirm
        // that this is intended.
        if (lastChar == 'A') {
            key.deleteCharAt(key.length() - 1);
        }
    }
    final String string = key.toString();
    return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
}
/**
 * Removes every character that is not a letter (as decided by
 * {@link Character#isLetter(char)}) and upper-cases the remainder using
 * the English locale.
 *
 * @param str the text to clean, may be {@code null} or empty
 * @return the upper-cased letters of {@code str}; {@code str} itself if it
 *         is {@code null} or empty
 */
static String clean(String str) {
    if (str == null || str.length() == 0) {
        return str;
    }
    final StringBuilder letters = new StringBuilder(str.length());
    for (final char candidate : str.toCharArray()) {
        if (Character.isLetter(candidate)) {
            letters.append(candidate);
        }
    }
    // Nothing was filtered out: upper-case the input directly
    if (letters.length() == str.length()) {
        return str.toUpperCase(java.util.Locale.ENGLISH);
    }
    return letters.toString().toUpperCase(java.util.Locale.ENGLISH);
}
}

View File

@ -2,6 +2,8 @@ package org.elasticsearch.index.analysis;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import static org.elasticsearch.common.settings.ImmutableSettings.*;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
@ -10,29 +12,38 @@ import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.hamcrest.MatcherAssert;
import static org.hamcrest.Matchers.*;
import org.testng.annotations.Test;
import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
/**
*/
public class SimplePhoneticAnalysisTests {
@Test
public void testDefaultsIcuAnalysis() {
public void testPhoneticTokenFilterFactory() {
Settings settings = settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml").build();
AnalysisService analysisService = testSimpleConfiguration(settings);
TokenFilterFactory standardfilterFactory = analysisService.tokenFilter("standard");
System.err.println("standard filterfactory = " + standardfilterFactory);
TokenFilterFactory filterFactory = analysisService.tokenFilter("phonetic");
System.err.println("filterfactory = " + filterFactory);
MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
}
private AnalysisService testSimpleConfiguration(Settings settings) {
Index index = new Index("test");
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector();
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
new EnvironmentModule(new Environment(settings)),
new IndicesAnalysisModule()).createInjector();
Injector injector = new ModulesBuilder().add(
new IndexSettingsModule(index, EMPTY_SETTINGS),
new IndexSettingsModule(index, settings),
new IndexNameModule(index),
new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new PhoneticAnalysisBinderProcessor()))
.createChildInjector(parentInjector);
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))
.addProcessor(new PhoneticAnalysisBinderProcessor())).createChildInjector(parentInjector);
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
// need to create one with encoder...
//TokenFilterFactory tokenFilterFactory = analysisService.tokenFilter("phonetic");
//MatcherAssert.assertThat(tokenFilterFactory, Matchers.instanceOf(PhoneticTokenFilterFactory.class));
return analysisService;
}
}

View File

@ -0,0 +1,30 @@
index:
analysis:
filter:
doublemetaphonefilter:
type: phonetic
encoder: doublemetaphone
metaphonefilter:
type: phonetic
encoder: metaphone
soundexfilter:
type: phonetic
encoder: soundex
refinedsoundexfilter:
type: phonetic
encoder: refinedsoundex
caverphonefilter:
type: phonetic
encoder: caverphone
beidermorsefilter:
type: phonetic
encoder: beidermorse
koelnerphonetikfilter:
type: phonetic
encoder: koelnerphonetik
haasephonetikfilter:
type: phonetic
encoder: haasephonetik
nysiisfilter:
type: phonetic
encoder: nysiis