migrate branch for analysis-phonetic

This commit is contained in:
Simon Willnauer 2015-06-05 13:12:21 +02:00
commit 0d328b07bd
13 changed files with 1301 additions and 0 deletions

View File

@ -0,0 +1,93 @@
Phonetic Analysis for Elasticsearch
===================================
The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch.
In order to install the plugin, simply run:
```sh
bin/plugin install elasticsearch/elasticsearch-analysis-phonetic/2.5.0
```
| elasticsearch |Phonetic Analysis Plugin| Docs |
|---------------|-----------------------|------------|
| master | Build from source | See below |
| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) |
| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.5.0/#version-250-for-elasticsearch-15) |
| es-1.4 | 2.4.3 | [2.4.3](https://github.com/elasticsearch/elasticsearch-analysis-phonetic/tree/v2.4.3/#version-243-for-elasticsearch-14) |
| < 1.4.5 | 2.4.2 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.2/#version-242-for-elasticsearch-14) |
| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.1/#version-241-for-elasticsearch-14) |
| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.3.0/#phonetic-analysis-for-elasticsearch) |
| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.2.0/#phonetic-analysis-for-elasticsearch) |
| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.1.0/#phonetic-analysis-for-elasticsearch) |
| es-1.0 | 2.0.0 | [2.0.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.0.0/#phonetic-analysis-for-elasticsearch) |
| es-0.90 | 1.8.0 | [1.8.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v1.8.0/#phonetic-analysis-for-elasticsearch) |
To build a `SNAPSHOT` version, you need to build it with Maven:
```bash
mvn clean install
plugin --install analysis-phonetic \
--url file:target/releases/elasticsearch-analysis-phonetic-X.X.X-SNAPSHOT.zip
```
## User guide
The plugin provides a `phonetic` token filter that can be configured with one of the following `encoder` types:
`metaphone`, `doublemetaphone`, `soundex`, `refinedsoundex`,
`caverphone1`, `caverphone2`, `cologne`, `nysiis`,
`koelnerphonetik`, `haasephonetik`, `beidermorse`
The `replace` parameter (defaults to `true`) controls whether the original token
is replaced by its encoded form (`true`) or the encoded form is added alongside the original token (`false`).
```js
{
"index" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "my_metaphone"]
}
},
"filter" : {
"my_metaphone" : {
"type" : "phonetic",
"encoder" : "metaphone",
"replace" : false
}
}
}
}
}
```
Note that `beidermorse` does not support the `replace` parameter.
Questions
---------
If you have questions or comments, please use the [mailing list](https://groups.google.com/group/elasticsearch) instead
of the GitHub issue tracker.
License
-------
This software is licensed under the Apache 2 license, quoted below.
Copyright 2009-2014 Elasticsearch <http://www.elasticsearch.org>
Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.

View File

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.elasticsearch.plugin</groupId>
<artifactId>elasticsearch-analysis-phonetic</artifactId>
<packaging>jar</packaging>
<name>Elasticsearch Phonetic Analysis plugin</name>
<description>The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch.</description>
<parent>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-plugin</artifactId>
<version>2.0.0-SNAPSHOT</version>
</parent>
<properties>
<!-- You can add any specific project property here -->
</properties>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-phonetic</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,26 @@
<?xml version="1.0"?>
<assembly>
<id>plugin</id>
<formats>
<format>zip</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<outputDirectory>/</outputDirectory>
<useProjectArtifact>true</useProjectArtifact>
<useTransitiveFiltering>true</useTransitiveFiltering>
<excludes>
<exclude>org.elasticsearch:elasticsearch</exclude>
</excludes>
</dependencySet>
<dependencySet>
<outputDirectory>/</outputDirectory>
<useProjectArtifact>true</useProjectArtifact>
<useTransitiveFiltering>true</useTransitiveFiltering>
<includes>
<include>org.apache.lucene:lucene-analyzers-phonetic</include>
</includes>
</dependencySet>
</dependencySets>
</assembly>

View File

@ -0,0 +1,30 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
/**
 * Analysis binder processor that registers the phonetic token filter factory
 * under the token filter type name {@code phonetic}.
 */
public class PhoneticAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {

    /** Type name under which {@link PhoneticTokenFilterFactory} is registered. */
    private static final String PHONETIC_FILTER_TYPE = "phonetic";

    /**
     * Registers {@link PhoneticTokenFilterFactory} with the supplied bindings.
     *
     * @param tokenFiltersBindings the token filter bindings to register on
     */
    @Override
    public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
        tokenFiltersBindings.processTokenFilter(PHONETIC_FILTER_TYPE, PhoneticTokenFilterFactory.class);
    }
}

View File

@ -0,0 +1,131 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.*;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.phonetic.HaasePhonetik;
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
import org.elasticsearch.index.analysis.phonetic.Nysiis;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Arrays;
import java.util.HashSet;
/**
*
*/
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
private final Encoder encoder;
private final boolean replace;
private int maxcodelength;
private String[] languageset;
private NameType nametype;
private RuleType ruletype;
@Inject
public PhoneticTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
this.languageset = null;
this.nametype = null;
this.ruletype = null;
this.maxcodelength = 0;
this.replace = settings.getAsBoolean("replace", true);
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
String encodername = settings.get("encoder", "metaphone");
if ("metaphone".equalsIgnoreCase(encodername)) {
this.encoder = new Metaphone();
} else if ("soundex".equalsIgnoreCase(encodername)) {
this.encoder = new Soundex();
} else if ("caverphone1".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone1();
} else if ("caverphone2".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone2();
} else if ("caverphone".equalsIgnoreCase(encodername)) {
this.encoder = new Caverphone2();
} else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) {
this.encoder = new RefinedSoundex();
} else if ("cologne".equalsIgnoreCase(encodername)) {
this.encoder = new ColognePhonetic();
} else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) {
this.encoder = null;
this.maxcodelength = settings.getAsInt("max_code_len", 4);
} else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) {
this.encoder = null;
this.languageset = settings.getAsArray("languageset");
String ruleType = settings.get("rule_type", "approx");
if ("approx".equalsIgnoreCase(ruleType)) {
ruletype = RuleType.APPROX;
} else if ("exact".equalsIgnoreCase(ruleType)) {
ruletype = RuleType.EXACT;
} else {
throw new IllegalArgumentException("No matching rule type [" + ruleType + "] for beider morse encoder");
}
String nameType = settings.get("name_type", "generic");
if ("GENERIC".equalsIgnoreCase(nameType)) {
nametype = NameType.GENERIC;
} else if ("ASHKENAZI".equalsIgnoreCase(nameType)) {
nametype = NameType.ASHKENAZI;
} else if ("SEPHARDIC".equalsIgnoreCase(nameType)) {
nametype = NameType.SEPHARDIC;
}
} else if ("koelnerphonetik".equalsIgnoreCase(encodername)) {
this.encoder = new KoelnerPhonetik();
} else if ("haasephonetik".equalsIgnoreCase(encodername)) {
this.encoder = new HaasePhonetik();
} else if ("nysiis".equalsIgnoreCase(encodername)) {
this.encoder = new Nysiis();
} else {
throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
}
}
@Override
public TokenStream create(TokenStream tokenStream) {
if (encoder == null) {
if (ruletype != null && nametype != null) {
if (languageset != null) {
final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList(languageset)));
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
}
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
}
if (maxcodelength > 0) {
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
}
} else {
return new PhoneticFilter(tokenStream, encoder, !replace);
}
throw new IllegalArgumentException("encoder error");
}
}

View File

@ -0,0 +1,71 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.phonetic;
/**
 * Haase variant of the K&ouml;lner Phonetik (Cologne phonetics).
 *
 * Modified algorithm from the Matching Toolbox by Rainer Schnell;
 * Java implementation by J&ouml;rg Reiher.
 *
 * The K&ouml;lner Phonetik was revised by Martin Haase (Institute of Linguistics,
 * University of Cologne) and Kai Heitmann (Institute for Medical Statistics,
 * Informatics and Epidemiology, Cologne) for use in name databases such as
 * the administration system of a hospital.
 * M. Haase und K. Heitmann. Die Erweiterte K&ouml;lner Phonetik. 526, 2000.
 *
 * After: Martin Wilz, Aspekte der Kodierung phonetischer &Auml;hnlichkeiten
 * in deutschen Eigennamen, master's thesis.
 * http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
 *
 * @author <a href="mailto:joergprante@gmail.com">J&ouml;rg Prante</a>
 */
public class HaasePhonetik extends KoelnerPhonetik {

    /** Regular expressions of the spelling variations; index-aligned with {@link #HAASE_REPLACEMENTS}. */
    private static final String[] HAASE_PATTERNS = {
            "OWN", "RB", "WSK", "A$", "O$", "SCH", "GLI", "EAU$", "^CH", "AUX", "EUX", "ILLE"
    };

    /** Replacement texts of the spelling variations; index-aligned with {@link #HAASE_PATTERNS}. */
    private static final String[] HAASE_REPLACEMENTS = {
            "AUN", "RW", "RSK", "AR", "OW", "CH", "LI", "O", "SCH", "O", "O", "I"
    };

    /** Code returned by {@link #getCode()}. */
    private static final char HAASE_CODE = '9';

    /**
     * Returns the Haase variation patterns.
     *
     * @return the pattern strings, one per variation
     */
    @Override
    protected String[] getPatterns() {
        return HAASE_PATTERNS;
    }

    /**
     * Returns the Haase variation replacements.
     *
     * @return the replacement strings, one per variation pattern
     */
    @Override
    protected String[] getReplacements() {
        return HAASE_REPLACEMENTS;
    }

    /**
     * Returns the code character of this variant.
     *
     * @return the character {@code '9'}
     */
    @Override
    protected char getCode() {
        return HAASE_CODE;
    }
}

View File

@ -0,0 +1,324 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.phonetic;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * K&ouml;lner Phonetik (Cologne phonetics) string encoder.
 *
 * H.J. Postel, Die K&ouml;lner Phonetik. Ein Verfahren zu Identifizierung
 * von Personennamen auf der Grundlage der Gestaltanalyse. IBM-Nachrichten 19 (1969), 925-931
 *
 * Algorithm from the Matching Toolbox by Rainer Schnell;
 * Java implementation by J&ouml;rg Reiher,
 * with changes by J&ouml;rg Prante.
 *
 * Subclasses customise the encoder through {@link #getPatterns()}, {@link #getReplacements()} and
 * {@link #getCode()}. Note that {@link #getPatterns()} is called from the constructor, so an
 * override must not depend on instance state of the subclass.
 */
public class KoelnerPhonetik implements StringEncoder {

    private static final String[] POSTEL_VARIATIONS_PATTERNS = {"AUN", "OWN", "RB", "RW", "WSK", "RSK"};
    private static final String[] POSTEL_VARIATIONS_REPLACEMENTS = {"OWN", "AUN", "RW", "RB", "RSK", "WSK"};

    /** Everything that is neither a letter nor a digit; stripped to build the primary form. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^\\p{L}\\p{N}]");
    /** Separator, control and punctuation characters; used to split the input into parts. */
    private static final Pattern PART_SEPARATOR = Pattern.compile("[\\p{Z}\\p{C}\\p{P}]");

    /** Compiled form of {@link #getPatterns()}, built once by {@link #init()}. */
    private Pattern[] variationsPatterns;
    /** When {@code true} only the primary form of the input is encoded. */
    private boolean primary = false;

    private final Set<Character> csz = new HashSet<Character>(Arrays.asList(
            'C', 'S', 'Z'));
    private final Set<Character> ckq = new HashSet<Character>(Arrays.asList(
            'C', 'K', 'Q'));
    private final Set<Character> aouhkxq = new HashSet<Character>(Arrays.asList(
            'A', 'O', 'U', 'H', 'K', 'X', 'Q'));
    private final Set<Character> ahkloqrux = new HashSet<Character>(Arrays.asList(
            'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'));

    /**
     * Creates an encoder that encodes the primary form as well as the partial forms of the input.
     */
    public KoelnerPhonetik() {
        init();
    }

    /**
     * Creates an encoder.
     *
     * @param useOnlyPrimaryCode {@code true} to encode only the primary form of the input (the input
     *                           with everything but letters and digits removed)
     */
    public KoelnerPhonetik(boolean useOnlyPrimaryCode) {
        this();
        this.primary = useOnlyPrimaryCode;
    }

    /**
     * Get variation patterns
     *
     * @return string array of variations (regular expressions), index-aligned with
     *         {@link #getReplacements()}
     */
    protected String[] getPatterns() {
        return POSTEL_VARIATIONS_PATTERNS;
    }

    /**
     * Get variation replacements
     *
     * @return string array of replacement texts, index-aligned with {@link #getPatterns()}
     */
    protected String[] getReplacements() {
        return POSTEL_VARIATIONS_REPLACEMENTS;
    }

    /**
     * Get the code emitted for a vowel at the start of a word (or directly after a leading 'H').
     *
     * @return the code character, {@code '0'} for this implementation
     */
    protected char getCode() {
        return '0';
    }

    /**
     * Compares the codes of two objects' string representations.
     *
     * @param o1 first object; its {@code toString()} value is encoded
     * @param o2 second object; its {@code toString()} value is encoded
     * @return {@code 1} if at least one code of {@code o1} equals a code of {@code o2}, {@code 0} otherwise
     */
    public double getRelativeValue(Object o1, Object o2) {
        String[] kopho1 = code(expandUmlauts(o1.toString().toUpperCase(Locale.GERMANY)));
        String[] kopho2 = code(expandUmlauts(o2.toString().toUpperCase(Locale.GERMANY)));
        for (int i = 0; i < kopho1.length; i++) {
            for (int ii = 0; ii < kopho2.length; ii++) {
                if (kopho1[i].equals(kopho2[ii])) {
                    return 1;
                }
            }
        }
        return 0;
    }

    /**
     * Encodes an object, which must be a {@link String} or {@code null}.
     *
     * @param str the object to encode
     * @return the encoded string, or {@code null} if {@code str} is {@code null}
     * @throws EncoderException if {@code str} is neither {@code null} nor a {@link String}
     */
    @Override
    public Object encode(Object str) throws EncoderException {
        // Report a wrong argument type through the declared checked exception rather than
        // letting a ClassCastException escape (same behaviour as the Nysiis encoder).
        if (str != null && !(str instanceof String)) {
            throw new EncoderException("Parameter supplied to KoelnerPhonetik encode is not of type java.lang.String");
        }
        return encode((String) str);
    }

    /**
     * Encodes a string. All codes computed for the input are joined with {@code '_'}.
     *
     * @param str the string to encode
     * @return the joined codes, or {@code null} if {@code str} is {@code null}
     * @throws EncoderException declared by the interface; not thrown by this implementation
     */
    @Override
    public String encode(String str) throws EncoderException {
        if (str == null) {
            return null;
        }
        String[] s = code(str);
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < s.length; i++) {
            sb.append(s[i]);
            if (i < s.length - 1) {
                sb.append('_');
            }
        }
        return sb.toString();
    }

    /** Compiles the variation patterns once so they can be reused for every input. */
    private void init() {
        String[] patterns = getPatterns();
        this.variationsPatterns = new Pattern[patterns.length];
        for (int i = 0; i < patterns.length; i++) {
            this.variationsPatterns[i] = Pattern.compile(patterns[i]);
        }
    }

    /** Computes one code per spelling variation of the input. */
    private String[] code(String str) {
        List<String> parts = partition(str);
        String[] codes = new String[parts.size()];
        int i = 0;
        for (String s : parts) {
            codes[i++] = substitute(s);
        }
        return codes;
    }

    /**
     * Builds the list of strings to encode: the primary form, (unless only the primary code is
     * requested) concatenations of consecutive input parts, and the spelling variations of each.
     */
    private List<String> partition(String str) {
        List<String> parts = new ArrayList<String>();
        parts.add(NON_ALPHANUMERIC.matcher(str).replaceAll(""));
        if (!primary) {
            List<String> tmpParts = new ArrayList<String>(Arrays.asList(PART_SEPARATOR.split(str)));
            int numberOfParts = tmpParts.size();
            while (!tmpParts.isEmpty()) {
                StringBuilder part = new StringBuilder();
                for (int i = 0; i < tmpParts.size(); i++) {
                    part.append(tmpParts.get(i));
                    // Skips the concatenation of all original parts; every other prefix is added.
                    if (i + 1 != numberOfParts) {
                        parts.add(part.toString());
                    }
                }
                tmpParts.remove(0);
            }
        }
        List<String> variations = new ArrayList<String>();
        for (String part : parts) {
            variations.addAll(getVariations(part));
        }
        return variations;
    }

    /**
     * Expands a string into its spelling variations. For each pattern occurrence found while
     * scanning left to right, every variation built so far is kept with the matched text and
     * duplicated with the replacement text.
     */
    private List<String> getVariations(String str) {
        int position = 0;
        List<String> variations = new ArrayList<String>();
        variations.add("");
        while (position < str.length()) {
            int substPos = -1;
            int substEnd = -1;
            int patternIndex = -1;
            // The first pattern (in declaration order) with a match at or after position wins.
            for (int i = 0; substPos < position && i < variationsPatterns.length; i++) {
                Matcher m = variationsPatterns[i].matcher(str);
                while (substPos < position && m.find()) {
                    if (m.end() == m.start()) {
                        // An empty match cannot be substituted and would stall the scan.
                        continue;
                    }
                    substPos = m.start();
                    substEnd = m.end();
                    patternIndex = i;
                }
            }
            if (substPos >= position) {
                String prevPart = str.substring(position, substPos);
                // Use the matched text, not the pattern source: anchored patterns such as "^CH"
                // or "A$" are longer than what they match.
                String matched = str.substring(substPos, substEnd);
                String replacement = getReplacements()[patternIndex];
                List<String> varNew = new ArrayList<String>(variations.size());
                for (int ii = 0; ii < variations.size(); ii++) {
                    String tmp = variations.get(ii);
                    varNew.add(tmp + prevPart + replacement);
                    variations.set(ii, tmp + prevPart + matched);
                }
                variations.addAll(varNew);
                // Continue right after the match. Advancing by the length of the pattern source
                // skipped the character following a "^CH" match.
                position = substEnd;
            } else {
                String rest = str.substring(position);
                for (int ii = 0; ii < variations.size(); ii++) {
                    variations.set(ii, variations.get(ii) + rest);
                }
                position = str.length();
            }
        }
        return variations;
    }

    /** Translates a single string into its digit code. */
    private String substitute(String str) {
        String s = expandUmlauts(str.toUpperCase(Locale.GERMAN));
        s = removeSequences(s);
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < s.length(); i++) {
            char current = s.charAt(i);
            // '_' stands for "no neighbour" at either end of the string.
            char next = i + 1 < s.length() ? s.charAt(i + 1) : '_';
            char prev = i > 0 ? s.charAt(i - 1) : '_';
            switch (current) {
                case 'A':
                case 'E':
                case 'I':
                case 'J':
                case 'Y':
                case 'O':
                case 'U':
                    // Vowels are only coded at the start of the string or right after a leading 'H'.
                    if (i == 0 || ((i == 1) && prev == 'H')) {
                        sb.append(getCode());
                    }
                    break;
                case 'P':
                    if (next == 'H') {
                        sb.append("33");
                    } else {
                        sb.append('1');
                    }
                    break;
                case 'B':
                    sb.append('1');
                    break;
                case 'D':
                case 'T':
                    sb.append(csz.contains(next) ? '8' : '2');
                    break;
                case 'F':
                case 'V':
                case 'W':
                    sb.append('3');
                    break;
                case 'G':
                case 'K':
                case 'Q':
                    sb.append('4');
                    break;
                case 'C':
                    if (i == 0) {
                        sb.append(ahkloqrux.contains(next) ? '4' : '8');
                    } else {
                        sb.append(aouhkxq.contains(next) ? '4' : '8');
                    }
                    // A 'C' whose preceding code is '8' is coded as '8' as well.
                    if (sb.length() >= 2 && sb.charAt(sb.length() - 2) == '8') {
                        sb.setCharAt(sb.length() - 1, '8');
                    }
                    break;
                case 'X':
                    if (i < 1 || !ckq.contains(prev)) {
                        sb.append("48");
                    } else {
                        sb.append('8');
                    }
                    break;
                case 'L':
                    sb.append('5');
                    break;
                case 'M':
                case 'N':
                    sb.append('6');
                    break;
                case 'R':
                    sb.append('7');
                    break;
                case 'S':
                case 'Z':
                    sb.append('8');
                    break;
                case 'H':
                    break;
                default:
                    // Any other character produces no code.
                    break;
            }
        }
        s = sb.toString();
        s = removeSequences(s);
        return s;
    }

    /**
     * Replaces the upper-case umlauts &Auml;, &Ouml; and &Uuml; with AE, OE and UE.
     *
     * @param str the (upper-cased) string to expand
     * @return the string with umlauts expanded
     */
    private String expandUmlauts(String str) {
        return str.replace("\u00C4", "AE").replace("\u00D6", "OE").replace("\u00DC", "UE");
    }

    /**
     * Collapses every run of identical adjacent characters into a single character.
     *
     * @param str the string to collapse, may be {@code null}
     * @return the collapsed string, or the empty string for {@code null} or empty input
     */
    private String removeSequences(String str) {
        if (str == null || str.length() == 0) {
            return "";
        }
        int i = 0, j = 0;
        StringBuilder sb = new StringBuilder().append(str.charAt(i++));
        char c;
        while (i < str.length()) {
            c = str.charAt(i);
            if (c != sb.charAt(j)) {
                sb.append(c);
                j++;
            }
            i++;
        }
        return sb.toString();
    }
}

View File

@ -0,0 +1,329 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.phonetic;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import java.util.regex.Pattern;
/**
 * NYSIIS string encoder, taken from commons-codec trunk (unreleased at the time of copying).
 *
 * <p>NYSIIS is an encoding used to relate similar names, but can also be used as a general
 * purpose scheme to find words with similar phonemes.</p>
 *
 * <p>Steps performed by {@link #nysiis(String)}:
 * <pre>
 * 1. Keep letters only and upper-case them
 * 2. Transcode the first characters of the name:
 *      MAC -&gt; MCC, KN -&gt; NN, K -&gt; C, PH | PF -&gt; FF, SCH -&gt; SSS
 * 3. Transcode the last characters of the name:
 *      EE | IE -&gt; Y, DT | RT | RD | NT | ND -&gt; D
 * 4. First character of key = first character of name
 * 5. Transcode the remaining characters, one position at a time:
 *      EV -&gt; AF else A, E, I, O, U -&gt; A
 *      Q -&gt; G, Z -&gt; S, M -&gt; N
 *      KN -&gt; NN else K -&gt; C
 *      SCH -&gt; SSS
 *      PH -&gt; FF
 *      H -&gt; previous character, if previous or next is not a vowel
 *      W -&gt; previous character, if previous is a vowel
 *    and append the result to the key if it differs from the character before it
 * 6. If the last character is S, remove it
 * 7. If the last characters are AY, replace them with Y
 * 8. If the last character is A, remove it
 * 9. In strict mode, truncate the key to 6 characters
 * </pre></p>
 *
 * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
 * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
 *
 */
public class Nysiis implements StringEncoder {

    private static final char[] CHARS_A = {'A'};
    private static final char[] CHARS_AF = {'A', 'F'};
    private static final char[] CHARS_C = {'C'};
    private static final char[] CHARS_FF = {'F', 'F'};
    private static final char[] CHARS_G = {'G'};
    private static final char[] CHARS_N = {'N'};
    private static final char[] CHARS_NN = {'N', 'N'};
    private static final char[] CHARS_S = {'S'};
    private static final char[] CHARS_SSS = {'S', 'S', 'S'};

    private static final Pattern PAT_MAC = Pattern.compile("^MAC");
    private static final Pattern PAT_KN = Pattern.compile("^KN");
    private static final Pattern PAT_K = Pattern.compile("^K");
    private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
    private static final Pattern PAT_SCH = Pattern.compile("^SCH");
    private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
    private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");

    /** Stand-in for a neighbour beyond the end of the input. */
    private static final char SPACE = ' ';
    /** Maximum key length in strict mode. */
    private static final int TRUE_LENGTH = 6;

    /**
     * Indicates the strict mode.
     */
    private final boolean strict;

    /**
     * Creates a strict encoder, i.e. encoded strings have a maximum length of 6.
     */
    public Nysiis() {
        this(true);
    }

    /**
     * Creates an encoder with the given strict mode.
     *
     * @param strict {@code true} to limit encoded strings to a length of 6, {@code false} to
     *               allow encoded strings of arbitrary length
     */
    public Nysiis(final boolean strict) {
        this.strict = strict;
    }

    /**
     * Tells whether the given character is one of the upper-case vowels A, E, I, O, U.
     *
     * @param c the character to test
     * @return {@code true} if the character is a vowel, {@code false} otherwise
     */
    private static boolean isVowel(final char c) {
        switch (c) {
            case 'A':
            case 'E':
            case 'I':
            case 'O':
            case 'U':
                return true;
            default:
                return false;
        }
    }

    /**
     * Transcodes the character at the current position, looking at a window of four
     * characters: [i-1, i, i+1, i+2].
     *
     * @param prev  the previous character
     * @param curr  the current character
     * @param next  the next character
     * @param aNext the character after the next one
     * @return the characters to write into the input starting at the current position
     */
    private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
        switch (curr) {
            case 'E':
                // EV -> AF, otherwise E is treated like any other vowel
                return next == 'V' ? CHARS_AF : CHARS_A;
            case 'A':
            case 'I':
            case 'O':
            case 'U':
                return CHARS_A;
            case 'Q':
                return CHARS_G;
            case 'Z':
                return CHARS_S;
            case 'M':
                return CHARS_N;
            case 'K':
                // KN -> NN else K -> C
                return next == 'N' ? CHARS_NN : CHARS_C;
            case 'S':
                // SCH -> SSS
                if (next == 'C' && aNext == 'H') {
                    return CHARS_SSS;
                }
                break;
            case 'P':
                // PH -> FF
                if (next == 'H') {
                    return CHARS_FF;
                }
                break;
            case 'H':
                // H -> previous, if previous or next is not a vowel
                if (!isVowel(prev) || !isVowel(next)) {
                    return new char[]{prev};
                }
                break;
            case 'W':
                // W -> previous, if previous is a vowel
                if (isVowel(prev)) {
                    return new char[]{prev};
                }
                break;
            default:
                break;
        }
        // Everything else stays as it is.
        return new char[]{curr};
    }

    /**
     * Encodes an Object using the NYSIIS algorithm. Provided to satisfy the Encoder interface.
     *
     * @param obj Object to encode, must be a {@link String}
     * @return the NYSIIS code corresponding to the given String
     * @throws EncoderException if the parameter supplied is not a {@link String}
     */
    @Override
    public Object encode(Object obj) throws EncoderException {
        if (obj instanceof String) {
            return this.nysiis((String) obj);
        }
        throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
    }

    /**
     * Encodes a String using the NYSIIS algorithm.
     *
     * @param str the String to encode
     * @return the NYSIIS code corresponding to the String supplied, {@code null} for {@code null}
     */
    @Override
    public String encode(String str) {
        return this.nysiis(str);
    }

    /**
     * Indicates the strict mode for this {@link Nysiis} encoder.
     *
     * @return {@code true} if the encoder is configured for strict mode, {@code false}
     *         otherwise
     */
    public boolean isStrict() {
        return this.strict;
    }

    /**
     * Retrieves the NYSIIS code for a given String.
     *
     * @param str String to encode using the NYSIIS algorithm
     * @return the NYSIIS code for the String supplied; {@code null} for {@code null} input and the
     *         empty string when the input contains no letters
     */
    public String nysiis(String str) {
        if (str == null) {
            return null;
        }

        // Letters only, upper-cased.
        str = clean(str);
        if (str.length() == 0) {
            return str;
        }

        // Head of the name: MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
        str = PAT_MAC.matcher(str).replaceFirst("MCC");
        str = PAT_KN.matcher(str).replaceFirst("NN");
        str = PAT_K.matcher(str).replaceFirst("C");
        str = PAT_PH_PF.matcher(str).replaceFirst("FF");
        str = PAT_SCH.matcher(str).replaceFirst("SSS");

        // Tail of the name: EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
        str = PAT_EE_IE.matcher(str).replaceFirst("Y");
        str = PAT_DT_ETC.matcher(str).replaceFirst("D");

        final char[] chars = str.toCharArray();
        final int len = chars.length;

        // The key starts with the first character of the name.
        final StringBuilder key = new StringBuilder(len);
        key.append(chars[0]);

        // Walk the rest one position at a time. The transcoded characters are written back into
        // the array, so later positions see the result of earlier transcodings.
        for (int pos = 1; pos < len; pos++) {
            final char following = pos + 1 < len ? chars[pos + 1] : SPACE;
            final char afterFollowing = pos + 2 < len ? chars[pos + 2] : SPACE;
            final char[] replacement = transcodeRemaining(chars[pos - 1], chars[pos], following, afterFollowing);
            System.arraycopy(replacement, 0, chars, pos, replacement.length);

            // Only extend the key when the character differs from the one before it.
            if (chars[pos] != chars[pos - 1]) {
                key.append(chars[pos]);
            }
        }

        if (key.length() > 1) {
            char tail = key.charAt(key.length() - 1);

            // Trailing S is dropped.
            if (tail == 'S') {
                key.deleteCharAt(key.length() - 1);
                tail = key.charAt(key.length() - 1);
            }

            // Trailing AY becomes Y.
            if (key.length() > 2) {
                final char beforeTail = key.charAt(key.length() - 2);
                if (beforeTail == 'A' && tail == 'Y') {
                    key.deleteCharAt(key.length() - 2);
                }
            }

            // Trailing A is dropped.
            if (tail == 'A') {
                key.deleteCharAt(key.length() - 1);
            }
        }

        final String code = key.toString();
        if (this.isStrict()) {
            return code.substring(0, Math.min(TRUE_LENGTH, code.length()));
        }
        return code;
    }

    /**
     * Removes every character that is not a letter and upper-cases the rest using the English locale.
     *
     * @param str the string to clean
     * @return the cleaned string; {@code null} or empty input is returned unchanged
     */
    static String clean(String str) {
        if (str == null || str.length() == 0) {
            return str;
        }
        final int len = str.length();
        final StringBuilder letters = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            final char c = str.charAt(i);
            if (Character.isLetter(c)) {
                letters.append(c);
            }
        }
        return letters.toString().toUpperCase(java.util.Locale.ENGLISH);
    }
}

View File

@ -0,0 +1,44 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.plugin.analysis;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.PhoneticAnalysisBinderProcessor;
import org.elasticsearch.plugins.AbstractPlugin;
/**
 * Plugin entry point for phonetic analysis support. It reports the plugin's name and
 * description and adds a {@link PhoneticAnalysisBinderProcessor} to the
 * {@link AnalysisModule} it is handed.
 */
public class AnalysisPhoneticPlugin extends AbstractPlugin {
/**
 * @return the plugin name, {@code "analysis-phonetic"}
 */
@Override
public String name() {
return "analysis-phonetic";
}
/**
 * @return a short human-readable description of the plugin
 */
@Override
public String description() {
return "Phonetic analysis support";
}
/**
 * Adds the phonetic analysis binder processor to the given analysis module.
 * NOTE(review): this method carries no {@code @Override}; presumably the plugin
 * framework locates and calls it by its {@code onModule} name - confirm.
 *
 * @param module the analysis module to extend
 */
public void onModule(AnalysisModule module) {
module.addProcessor(new PhoneticAnalysisBinderProcessor());
}
}

View File

@ -0,0 +1,3 @@
# Plugin descriptor. "plugin" is the fully-qualified name of the plugin entry-point class.
# NOTE(review): the version and lucene values look like build-time placeholders,
# presumably substituted when resources are processed - confirm against the build.
plugin=org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin
version=${project.version}
lucene=${lucene.version}

View File

@ -0,0 +1,72 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.hamcrest.MatcherAssert;
import org.junit.Test;
import static org.elasticsearch.common.settings.Settings.settingsBuilder;
import static org.hamcrest.Matchers.instanceOf;
/**
 * Checks that registering {@link PhoneticAnalysisBinderProcessor} on the analysis module
 * makes a {@code phonetic} token filter available through {@link AnalysisService}.
 */
public class SimplePhoneticAnalysisTests extends ElasticsearchTestCase {

    /**
     * Loads the analysis settings from the classpath, builds an analysis service from them
     * and asserts that the {@code phonetic} token filter is a {@link PhoneticTokenFilterFactory}.
     */
    @Test
    public void testPhoneticTokenFilterFactory() {
        final Settings indexSettings = settingsBuilder()
                .loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml")
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("path.home", createTempDir())
                .build();

        final TokenFilterFactory phoneticFilter = buildAnalysisService(indexSettings).tokenFilter("phonetic");

        MatcherAssert.assertThat(phoneticFilter, instanceOf(PhoneticTokenFilterFactory.class));
    }

    /**
     * Wires a parent injector (settings, environment, indices analysis) and a child injector
     * for an index named {@code test} whose analysis module has the phonetic binder
     * processor added, then returns the child injector's {@link AnalysisService}.
     *
     * @param settings settings used for both the parent and the index-level modules
     * @return the analysis service obtained from the index-level injector
     */
    private AnalysisService buildAnalysisService(Settings settings) {
        final Index testIndex = new Index("test");

        final Injector nodeInjector = new ModulesBuilder()
                .add(new SettingsModule(settings),
                        new EnvironmentModule(new Environment(settings)),
                        new IndicesAnalysisModule())
                .createInjector();

        final Injector indexInjector = new ModulesBuilder()
                .add(new IndexSettingsModule(testIndex, settings),
                        new IndexNameModule(testIndex),
                        new AnalysisModule(settings, nodeInjector.getInstance(IndicesAnalysisService.class))
                                .addProcessor(new PhoneticAnalysisBinderProcessor()))
                .createChildInjector(nodeInjector);

        return indexInjector.getInstance(AnalysisService.class);
    }
}

View File

@ -0,0 +1,108 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.plugins.PluginsService;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.notNullValue;
@ElasticsearchIntegrationTest.ClusterScope(numDataNodes = 1, scope = ElasticsearchIntegrationTest.Scope.SUITE)
public class SimplePhoneticIntegrationTests extends ElasticsearchIntegrationTest {

    /**
     * Extends the default node settings with the flag that enables loading plugins
     * from the classpath.
     */
    @Override
    protected Settings nodeSettings(int nodeOrdinal) {
        final String loadFromClasspathKey = "plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH;
        return Settings.builder()
                .put(super.nodeSettings(nodeOrdinal))
                .put(loadFromClasspathKey, true)
                .build();
    }

    /**
     * Extends the default index settings with {@code my_analyzer}: a standard tokenizer
     * followed by the {@code standard}, {@code lowercase} and {@code my_metaphone} filters,
     * where {@code my_metaphone} is a {@code phonetic} filter using the {@code metaphone}
     * encoder with {@code replace} set to {@code false}.
     */
    @Override
    public Settings indexSettings() {
        return Settings.builder()
                .put(super.indexSettings())
                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
                .putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_metaphone")
                .put("index.analysis.filter.my_metaphone.type", "phonetic")
                .put("index.analysis.filter.my_metaphone.encoder", "metaphone")
                .put("index.analysis.filter.my_metaphone.replace", false)
                .build();
    }

    /**
     * Analyzes "hello world" with {@code my_analyzer} and expects four tokens: each
     * phonetic code followed by the original lower-cased term.
     */
    @Test
    public void testPhoneticAnalyzer() throws ExecutionException, InterruptedException {
        createIndex("test");
        ensureGreen("test");

        final AnalyzeResponse analyzed = client().admin().indices()
                .prepareAnalyze("hello world")
                .setIndex("test")
                .setAnalyzer("my_analyzer")
                .execute().get();

        assertThat(analyzed, notNullValue());

        // Expected terms in emission order.
        final String[] expectedTerms = {"HL", "hello", "WRLT", "world"};
        assertThat(analyzed.getTokens().size(), is(expectedTerms.length));
        for (int position = 0; position < expectedTerms.length; position++) {
            assertThat(analyzed.getTokens().get(position).getTerm(), is(expectedTerms[position]));
        }
    }

    /**
     * Maps field {@code foo} to {@code my_analyzer}, indexes "hello world" and expects a
     * match query for the differently spelled term "helllo" to find that single document.
     */
    @Test
    public void testPhoneticAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException {
        createIndex("test");
        ensureGreen("test");

        final XContentBuilder fooMapping = jsonBuilder()
                .startObject()
                    .startObject("type")
                        .startObject("properties")
                            .startObject("foo")
                                .field("type", "string")
                                .field("analyzer", "my_analyzer")
                            .endObject()
                        .endObject()
                    .endObject()
                .endObject();
        client().admin().indices().preparePutMapping("test").setType("type").setSource(fooMapping).get();

        index("test", "type", "1", "foo", "hello world");
        refresh();

        final SearchResponse searchResponse = client().prepareSearch("test")
                .setQuery(QueryBuilders.matchQuery("foo", "helllo"))
                .execute().actionGet();

        assertThat(searchResponse.getHits().getTotalHits(), is(1L));
    }
}

View File

@ -0,0 +1,30 @@
# Token filter definitions of type `phonetic`, each selecting a different encoder.
# Nesting matters: every filter name sits under index.analysis.filter and owns
# its own `type` and `encoder` keys.
index:
    analysis:
        filter:
            doublemetaphonefilter:
                type: phonetic
                encoder: doublemetaphone
            metaphonefilter:
                type: phonetic
                encoder: metaphone
            soundexfilter:
                type: phonetic
                encoder: soundex
            refinedsoundexfilter:
                type: phonetic
                encoder: refinedsoundex
            caverphonefilter:
                type: phonetic
                encoder: caverphone
            beidermorsefilter:
                type: phonetic
                encoder: beidermorse
            koelnerphonetikfilter:
                type: phonetic
                encoder: koelnerphonetik
            haasephonetikfilter:
                type: phonetic
                encoder: haasephonetik
            nysiisfilter:
                type: phonetic
                encoder: nysiis