migrate branch for analysis-phonetic
This commit is contained in:
commit
0d328b07bd
|
@ -0,0 +1,93 @@
|
||||||
|
Phonetic Analysis for Elasticsearch
|
||||||
|
===================================
|
||||||
|
|
||||||
|
The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch.
|
||||||
|
|
||||||
|
In order to install the plugin, simply run:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
bin/plugin install elasticsearch/elasticsearch-analysis-phonetic/2.5.0
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
| elasticsearch |Phonetic Analysis Plugin| Docs |
|
||||||
|
|---------------|-----------------------|------------|
|
||||||
|
| master | Build from source | See below |
|
||||||
|
| es-1.x | Build from source | [2.6.0-SNAPSHOT](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/es-1.x/#version-260-snapshot-for-elasticsearch-1x) |
|
||||||
|
| es-1.5 | 2.5.0 | [2.5.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.5.0/#version-250-for-elasticsearch-15) |
|
||||||
|
| es-1.4 | 2.4.3 | [2.4.3](https://github.com/elasticsearch/elasticsearch-analysis-phonetic/tree/v2.4.3/#version-243-for-elasticsearch-14) |
|
||||||
|
| < 1.4.5 | 2.4.2 | [2.4.2](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.2/#version-242-for-elasticsearch-14) |
|
||||||
|
| < 1.4.3 | 2.4.1 | [2.4.1](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.4.1/#version-241-for-elasticsearch-14) |
|
||||||
|
| es-1.3 | 2.3.0 | [2.3.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.3.0/#phonetic-analysis-for-elasticsearch) |
|
||||||
|
| es-1.2 | 2.2.0 | [2.2.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.2.0/#phonetic-analysis-for-elasticsearch) |
|
||||||
|
| es-1.1 | 2.1.0 | [2.1.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.1.0/#phonetic-analysis-for-elasticsearch) |
|
||||||
|
| es-1.0 | 2.0.0 | [2.0.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v2.0.0/#phonetic-analysis-for-elasticsearch) |
|
||||||
|
| es-0.90 | 1.8.0 | [1.8.0](https://github.com/elastic/elasticsearch-analysis-phonetic/tree/v1.8.0/#phonetic-analysis-for-elasticsearch) |
|
||||||
|
|
||||||
|
To build a `SNAPSHOT` version, you need to build it with Maven:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mvn clean install
|
||||||
|
plugin --install analysis-phonetic \
|
||||||
|
--url file:target/releases/elasticsearch-analysis-phonetic-X.X.X-SNAPSHOT.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
## User guide
|
||||||
|
|
||||||
|
A `phonetic` token filter that can be configured with different `encoder` types:
|
||||||
|
`metaphone`, `doublemetaphone`, `soundex`, `refinedsoundex`,
|
||||||
|
`caverphone1`, `caverphone2`, `cologne`, `nysiis`,
|
||||||
|
`koelnerphonetik`, `haasephonetik`, `beidermorse`
|
||||||
|
|
||||||
|
The `replace` parameter (defaults to `true`) controls if the token processed
|
||||||
|
should be replaced with the encoded one (set it to `true`), or kept and emitted together with the encoded one (set it to `false`).
|
||||||
|
|
||||||
|
```js
|
||||||
|
{
|
||||||
|
"index" : {
|
||||||
|
"analysis" : {
|
||||||
|
"analyzer" : {
|
||||||
|
"my_analyzer" : {
|
||||||
|
"tokenizer" : "standard",
|
||||||
|
"filter" : ["standard", "lowercase", "my_metaphone"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"filter" : {
|
||||||
|
"my_metaphone" : {
|
||||||
|
"type" : "phonetic",
|
||||||
|
"encoder" : "metaphone",
|
||||||
|
"replace" : false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the `beidermorse` encoder does not support the `replace` parameter.
|
||||||
|
|
||||||
|
|
||||||
|
Questions
|
||||||
|
---------
|
||||||
|
|
||||||
|
If you have questions or comments please use the [mailing list](https://groups.google.com/group/elasticsearch) instead
|
||||||
|
of the GitHub issue tracker.
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
|
||||||
|
This software is licensed under the Apache 2 license, quoted below.
|
||||||
|
|
||||||
|
Copyright 2009-2014 Elasticsearch <http://www.elasticsearch.org>
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||||
|
use this file except in compliance with the License. You may obtain a copy of
|
||||||
|
the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific language governing permissions and limitations under
|
||||||
|
the License.
|
|
@ -0,0 +1,40 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<groupId>org.elasticsearch.plugin</groupId>
|
||||||
|
<artifactId>elasticsearch-analysis-phonetic</artifactId>
|
||||||
|
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
<name>Elasticsearch Phonetic Analysis plugin</name>
|
||||||
|
<description>The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch.</description>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<groupId>org.elasticsearch</groupId>
|
||||||
|
<artifactId>elasticsearch-plugin</artifactId>
|
||||||
|
<version>2.0.0-SNAPSHOT</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<!-- You can add any specific project property here -->
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.lucene</groupId>
|
||||||
|
<artifactId>lucene-analyzers-phonetic</artifactId>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-assembly-plugin</artifactId>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
|
@ -0,0 +1,26 @@
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
<assembly>
|
||||||
|
<id>plugin</id>
|
||||||
|
<formats>
|
||||||
|
<format>zip</format>
|
||||||
|
</formats>
|
||||||
|
<includeBaseDirectory>false</includeBaseDirectory>
|
||||||
|
<dependencySets>
|
||||||
|
<dependencySet>
|
||||||
|
<outputDirectory>/</outputDirectory>
|
||||||
|
<useProjectArtifact>true</useProjectArtifact>
|
||||||
|
<useTransitiveFiltering>true</useTransitiveFiltering>
|
||||||
|
<excludes>
|
||||||
|
<exclude>org.elasticsearch:elasticsearch</exclude>
|
||||||
|
</excludes>
|
||||||
|
</dependencySet>
|
||||||
|
<dependencySet>
|
||||||
|
<outputDirectory>/</outputDirectory>
|
||||||
|
<useProjectArtifact>true</useProjectArtifact>
|
||||||
|
<useTransitiveFiltering>true</useTransitiveFiltering>
|
||||||
|
<includes>
|
||||||
|
<include>org.apache.lucene:lucene-analyzers-phonetic</include>
|
||||||
|
</includes>
|
||||||
|
</dependencySet>
|
||||||
|
</dependencySets>
|
||||||
|
</assembly>
|
|
@ -0,0 +1,30 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*/
|
||||||
|
public class PhoneticAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
|
||||||
|
tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,131 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.Encoder;
|
||||||
|
import org.apache.commons.codec.language.*;
|
||||||
|
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
|
||||||
|
import org.apache.commons.codec.language.bm.NameType;
|
||||||
|
import org.apache.commons.codec.language.bm.PhoneticEngine;
|
||||||
|
import org.apache.commons.codec.language.bm.RuleType;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
|
||||||
|
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
|
||||||
|
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
|
||||||
|
import org.elasticsearch.common.inject.Inject;
|
||||||
|
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.analysis.phonetic.HaasePhonetik;
|
||||||
|
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
|
||||||
|
import org.elasticsearch.index.analysis.phonetic.Nysiis;
|
||||||
|
import org.elasticsearch.index.settings.IndexSettings;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
|
private final Encoder encoder;
|
||||||
|
private final boolean replace;
|
||||||
|
private int maxcodelength;
|
||||||
|
private String[] languageset;
|
||||||
|
private NameType nametype;
|
||||||
|
private RuleType ruletype;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public PhoneticTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||||
|
super(index, indexSettings, name, settings);
|
||||||
|
this.languageset = null;
|
||||||
|
this.nametype = null;
|
||||||
|
this.ruletype = null;
|
||||||
|
this.maxcodelength = 0;
|
||||||
|
this.replace = settings.getAsBoolean("replace", true);
|
||||||
|
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
|
||||||
|
String encodername = settings.get("encoder", "metaphone");
|
||||||
|
if ("metaphone".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new Metaphone();
|
||||||
|
} else if ("soundex".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new Soundex();
|
||||||
|
} else if ("caverphone1".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new Caverphone1();
|
||||||
|
} else if ("caverphone2".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new Caverphone2();
|
||||||
|
} else if ("caverphone".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new Caverphone2();
|
||||||
|
} else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new RefinedSoundex();
|
||||||
|
} else if ("cologne".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new ColognePhonetic();
|
||||||
|
} else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = null;
|
||||||
|
this.maxcodelength = settings.getAsInt("max_code_len", 4);
|
||||||
|
} else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = null;
|
||||||
|
this.languageset = settings.getAsArray("languageset");
|
||||||
|
String ruleType = settings.get("rule_type", "approx");
|
||||||
|
if ("approx".equalsIgnoreCase(ruleType)) {
|
||||||
|
ruletype = RuleType.APPROX;
|
||||||
|
} else if ("exact".equalsIgnoreCase(ruleType)) {
|
||||||
|
ruletype = RuleType.EXACT;
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("No matching rule type [" + ruleType + "] for beider morse encoder");
|
||||||
|
}
|
||||||
|
String nameType = settings.get("name_type", "generic");
|
||||||
|
if ("GENERIC".equalsIgnoreCase(nameType)) {
|
||||||
|
nametype = NameType.GENERIC;
|
||||||
|
} else if ("ASHKENAZI".equalsIgnoreCase(nameType)) {
|
||||||
|
nametype = NameType.ASHKENAZI;
|
||||||
|
} else if ("SEPHARDIC".equalsIgnoreCase(nameType)) {
|
||||||
|
nametype = NameType.SEPHARDIC;
|
||||||
|
}
|
||||||
|
} else if ("koelnerphonetik".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new KoelnerPhonetik();
|
||||||
|
} else if ("haasephonetik".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new HaasePhonetik();
|
||||||
|
} else if ("nysiis".equalsIgnoreCase(encodername)) {
|
||||||
|
this.encoder = new Nysiis();
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
if (encoder == null) {
|
||||||
|
if (ruletype != null && nametype != null) {
|
||||||
|
if (languageset != null) {
|
||||||
|
final LanguageSet languages = LanguageSet.from(new HashSet(Arrays.asList(languageset)));
|
||||||
|
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
|
||||||
|
}
|
||||||
|
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
|
||||||
|
}
|
||||||
|
if (maxcodelength > 0) {
|
||||||
|
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return new PhoneticFilter(tokenStream, encoder, !replace);
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("encoder error");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,71 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis.phonetic;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Geänderter Algorithmus aus der Matching Toolbox von Rainer Schnell
|
||||||
|
* Java-Programmierung von Jörg Reiher
|
||||||
|
*
|
||||||
|
* Die Kölner Phonetik wurde für den Einsatz in Namensdatenbanken wie
|
||||||
|
* der Verwaltung eines Krankenhauses durch Martin Haase (Institut für
|
||||||
|
* Sprachwissenschaft, Universität zu Köln) und Kai Heitmann (Insitut für
|
||||||
|
* medizinische Statistik, Informatik und Epidemiologie, Köln) überarbeitet.
|
||||||
|
* M. Haase und K. Heitmann. Die Erweiterte Kölner Phonetik. 526, 2000.
|
||||||
|
*
|
||||||
|
* nach: Martin Wilz, Aspekte der Kodierung phonetischer Ähnlichkeiten
|
||||||
|
* in deutschen Eigennamen, Magisterarbeit.
|
||||||
|
* http://www.uni-koeln.de/phil-fak/phonetik/Lehre/MA-Arbeiten/magister_wilz.pdf
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:joergprante@gmail.com">Jörg Prante</a>
|
||||||
|
*/
|
||||||
|
public class HaasePhonetik extends KoelnerPhonetik {
|
||||||
|
|
||||||
|
private final static String[] HAASE_VARIATIONS_PATTERNS = {"OWN", "RB", "WSK", "A$", "O$", "SCH",
|
||||||
|
"GLI", "EAU$", "^CH", "AUX", "EUX", "ILLE"};
|
||||||
|
private final static String[] HAASE_VARIATIONS_REPLACEMENTS = {"AUN", "RW", "RSK", "AR", "OW", "CH",
|
||||||
|
"LI", "O", "SCH", "O", "O", "I"};
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected String[] getPatterns() {
|
||||||
|
return HAASE_VARIATIONS_PATTERNS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected String[] getReplacements() {
|
||||||
|
return HAASE_VARIATIONS_REPLACEMENTS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected char getCode() {
|
||||||
|
return '9';
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,324 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis.phonetic;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.EncoderException;
|
||||||
|
import org.apache.commons.codec.StringEncoder;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Kölner Phonetik
|
||||||
|
*
|
||||||
|
* H.J. Postel, Die Kölner Phonetik. Ein Verfahren zu Identifizierung
|
||||||
|
* von Personennamen auf der Grundlage der Gestaltanalyse. IBM-Nachrichten 19 (1969), 925-931
|
||||||
|
*
|
||||||
|
* Algorithmus aus der Matching Toolbox von Rainer Schnell
|
||||||
|
* Java-Programmierung von Jörg Reiher
|
||||||
|
*
|
||||||
|
* mit Änderungen von Jörg Prante
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class KoelnerPhonetik implements StringEncoder {
|
||||||
|
|
||||||
|
private static final String[] POSTEL_VARIATIONS_PATTERNS = {"AUN", "OWN", "RB", "RW", "WSK", "RSK"};
|
||||||
|
private static final String[] POSTEL_VARIATIONS_REPLACEMENTS = {"OWN", "AUN", "RW", "RB", "RSK", "WSK"};
|
||||||
|
private Pattern[] variationsPatterns;
|
||||||
|
private boolean primary = false;
|
||||||
|
private final Set<Character> csz = new HashSet(Arrays.asList(
|
||||||
|
'C', 'S', 'Z'));
|
||||||
|
private final Set<Character> ckq = new HashSet(Arrays.asList(
|
||||||
|
'C', 'K', 'Q'));
|
||||||
|
private final Set<Character> aouhkxq = new HashSet(Arrays.asList(
|
||||||
|
'A', 'O', 'U', 'H', 'K', 'X', 'Q'));
|
||||||
|
private final Set<Character> ahkloqrux = new HashSet(Arrays.asList(
|
||||||
|
'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for Kölner Phonetik
|
||||||
|
*/
|
||||||
|
public KoelnerPhonetik() {
|
||||||
|
init();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param useOnlyPrimaryCode
|
||||||
|
*/
|
||||||
|
public KoelnerPhonetik(boolean useOnlyPrimaryCode) {
|
||||||
|
this();
|
||||||
|
this.primary = useOnlyPrimaryCode;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get variation patterns
|
||||||
|
*
|
||||||
|
* @return string array of variations
|
||||||
|
*/
|
||||||
|
protected String[] getPatterns() {
|
||||||
|
return POSTEL_VARIATIONS_PATTERNS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
protected String[] getReplacements() {
|
||||||
|
return POSTEL_VARIATIONS_REPLACEMENTS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
protected char getCode() {
|
||||||
|
return '0';
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param o1
|
||||||
|
* @param o2
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public double getRelativeValue(Object o1, Object o2) {
|
||||||
|
String[] kopho1 = code(expandUmlauts(o1.toString().toUpperCase(Locale.GERMANY)));
|
||||||
|
String[] kopho2 = code(expandUmlauts(o2.toString().toUpperCase(Locale.GERMANY)));
|
||||||
|
for (int i = 0; i < kopho1.length; i++) {
|
||||||
|
for (int ii = 0; ii < kopho2.length; ii++) {
|
||||||
|
if (kopho1[i].equals(kopho2[ii])) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object encode(Object str) throws EncoderException {
|
||||||
|
return encode((String) str);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String encode(String str) throws EncoderException {
|
||||||
|
if (str == null) return null;
|
||||||
|
String[] s = code(str.toString());
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = 0; i < s.length; i++) {
|
||||||
|
sb.append(s[i]);
|
||||||
|
if (i < s.length - 1) {
|
||||||
|
sb.append('_');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void init() {
|
||||||
|
this.variationsPatterns = new Pattern[getPatterns().length];
|
||||||
|
for (int i = 0; i < getPatterns().length; i++) {
|
||||||
|
this.variationsPatterns[i] = Pattern.compile(getPatterns()[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String[] code(String str) {
|
||||||
|
List<String> parts = partition(str);
|
||||||
|
String[] codes = new String[parts.size()];
|
||||||
|
int i = 0;
|
||||||
|
for (String s : parts) {
|
||||||
|
codes[i++] = substitute(s);
|
||||||
|
}
|
||||||
|
return codes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> partition(String str) {
|
||||||
|
String primaryForm = str;
|
||||||
|
List<String> parts = new ArrayList();
|
||||||
|
parts.add(primaryForm.replaceAll("[^\\p{L}\\p{N}]", ""));
|
||||||
|
if (!primary) {
|
||||||
|
List<String> tmpParts = new ArrayList();
|
||||||
|
tmpParts.addAll((Arrays.asList(str.split("[\\p{Z}\\p{C}\\p{P}]"))));
|
||||||
|
int numberOfParts = tmpParts.size();
|
||||||
|
while (tmpParts.size() > 0) {
|
||||||
|
StringBuilder part = new StringBuilder();
|
||||||
|
for (int i = 0; i < tmpParts.size(); i++) {
|
||||||
|
part.append(tmpParts.get(i));
|
||||||
|
if (!(i + 1 == numberOfParts)) {
|
||||||
|
parts.add(part.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tmpParts.remove(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
List<String> variations = new ArrayList();
|
||||||
|
for (int i = 0; i < parts.size(); i++) {
|
||||||
|
List variation = getVariations(parts.get(i));
|
||||||
|
if (variation != null) {
|
||||||
|
variations.addAll(variation);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return variations;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List getVariations(String str) {
|
||||||
|
int position = 0;
|
||||||
|
List<String> variations = new ArrayList();
|
||||||
|
variations.add("");
|
||||||
|
while (position < str.length()) {
|
||||||
|
int i = 0;
|
||||||
|
int substPos = -1;
|
||||||
|
while (substPos < position && i < getPatterns().length) {
|
||||||
|
Matcher m = variationsPatterns[i].matcher(str);
|
||||||
|
while (substPos < position && m.find()) {
|
||||||
|
substPos = m.start();
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
if (substPos >= position) {
|
||||||
|
i--;
|
||||||
|
List<String> varNew = new ArrayList();
|
||||||
|
String prevPart = str.substring(position, substPos);
|
||||||
|
for (int ii = 0; ii < variations.size(); ii++) {
|
||||||
|
String tmp = variations.get(ii);
|
||||||
|
varNew.add(tmp.concat(prevPart + getReplacements()[i]));
|
||||||
|
variations.set(ii, variations.get(ii) + prevPart + getPatterns()[i]);
|
||||||
|
}
|
||||||
|
variations.addAll(varNew);
|
||||||
|
position = substPos + getPatterns()[i].length();
|
||||||
|
} else {
|
||||||
|
for (int ii = 0; ii < variations.size(); ii++) {
|
||||||
|
variations.set(ii, variations.get(ii) + str.substring(position, str.length()));
|
||||||
|
}
|
||||||
|
position = str.length();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return variations;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String substitute(String str) {
|
||||||
|
String s = expandUmlauts(str.toUpperCase(Locale.GERMAN));
|
||||||
|
s = removeSequences(s);
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = 0; i < s.length(); i++) {
|
||||||
|
char current = s.charAt(i);
|
||||||
|
char next = i + 1 < s.length() ? s.charAt(i + 1) : '_';
|
||||||
|
char prev = i > 0 ? s.charAt(i - 1) : '_';
|
||||||
|
switch (current) {
|
||||||
|
case 'A':
|
||||||
|
case 'E':
|
||||||
|
case 'I':
|
||||||
|
case 'J':
|
||||||
|
case 'Y':
|
||||||
|
case 'O':
|
||||||
|
case 'U':
|
||||||
|
if (i == 0 || ((i == 1) && prev == 'H')) {
|
||||||
|
sb.append(getCode());
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'P':
|
||||||
|
sb.append(next == 'H' ? "33" : '1');
|
||||||
|
break;
|
||||||
|
case 'B':
|
||||||
|
sb.append('1');
|
||||||
|
break;
|
||||||
|
case 'D':
|
||||||
|
case 'T':
|
||||||
|
sb.append(csz.contains(next) ? '8' : '2');
|
||||||
|
break;
|
||||||
|
case 'F':
|
||||||
|
case 'V':
|
||||||
|
case 'W':
|
||||||
|
sb.append('3');
|
||||||
|
break;
|
||||||
|
case 'G':
|
||||||
|
case 'K':
|
||||||
|
case 'Q':
|
||||||
|
sb.append('4');
|
||||||
|
break;
|
||||||
|
case 'C':
|
||||||
|
if (i == 0) {
|
||||||
|
sb.append(ahkloqrux.contains(next) ? '4' : '8');
|
||||||
|
} else {
|
||||||
|
sb.append(aouhkxq.contains(next) ? '4' : '8');
|
||||||
|
}
|
||||||
|
if (sb.length() >= 2 && sb.charAt(sb.length() - 2) == '8') {
|
||||||
|
sb.setCharAt(sb.length() - 1, '8');
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'X':
|
||||||
|
sb.append(i < 1 || !ckq.contains(prev) ? "48" : '8');
|
||||||
|
break;
|
||||||
|
case 'L':
|
||||||
|
sb.append('5');
|
||||||
|
break;
|
||||||
|
case 'M':
|
||||||
|
case 'N':
|
||||||
|
sb.append('6');
|
||||||
|
break;
|
||||||
|
case 'R':
|
||||||
|
sb.append('7');
|
||||||
|
break;
|
||||||
|
case 'S':
|
||||||
|
case 'Z':
|
||||||
|
sb.append('8');
|
||||||
|
break;
|
||||||
|
case 'H':
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s = sb.toString();
|
||||||
|
s = removeSequences(s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param str
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private String expandUmlauts(String str) {
|
||||||
|
return str.replaceAll("\u00C4", "AE").replaceAll("\u00D6", "OE").replaceAll("\u00DC", "UE");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param str
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private String removeSequences(String str) {
|
||||||
|
if (str == null || str.length() == 0) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
int i = 0, j = 0;
|
||||||
|
StringBuilder sb = new StringBuilder().append(str.charAt(i++));
|
||||||
|
char c;
|
||||||
|
while (i < str.length()) {
|
||||||
|
c = str.charAt(i);
|
||||||
|
if (c != sb.charAt(j)) {
|
||||||
|
sb.append(c);
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,329 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis.phonetic;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.EncoderException;
|
||||||
|
import org.apache.commons.codec.StringEncoder;
|
||||||
|
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* Taken from commons-codec trunk (unreleased yet)
|
||||||
|
*
|
||||||
|
* Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate
|
||||||
|
* similar names, but can also be used as a general purpose scheme to find word
|
||||||
|
* with similar phonemes.
|
||||||
|
*
|
||||||
|
* <p> NYSIIS features an accuracy increase of 2.7% over the traditional Soundex
|
||||||
|
* algorithm. </p>
|
||||||
|
*
|
||||||
|
* <p>Algorithm description:
|
||||||
|
* <pre>
|
||||||
|
* 1. Transcode first characters of name
|
||||||
|
* 1a. MAC -> MCC
|
||||||
|
* 1b. KN -> NN
|
||||||
|
* 1c. K -> C
|
||||||
|
* 1d. PH -> FF
|
||||||
|
* 1e. PF -> FF
|
||||||
|
* 1f. SCH -> SSS
|
||||||
|
* 2. Transcode last characters of name
|
||||||
|
* 2a. EE, IE -> Y
|
||||||
|
* 2b. DT,RT,RD,NT,ND -> D
|
||||||
|
* 3. First character of key = first character of name
|
||||||
|
* 4. Transcode remaining characters by following these rules, incrementing by one character each time
|
||||||
|
* 4a. EV -> AF else A,E,I,O,U -> A
|
||||||
|
* 4b. Q -> G
|
||||||
|
* 4c. Z -> S
|
||||||
|
* 4d. M -> N
|
||||||
|
* 4e. KN -> N else K -> C
|
||||||
|
* 4f. SCH -> SSS
|
||||||
|
* 4g. PH -> FF
|
||||||
|
* 4h. H -> If previous or next is nonvowel, previous
|
||||||
|
* 4i. W -> If previous is vowel, previous
|
||||||
|
* 4j. Add current to key if current != last key character
|
||||||
|
* 5. If last character is S, remove it
|
||||||
|
* 6. If last characters are AY, replace with Y
|
||||||
|
* 7. If last character is A, remove it
|
||||||
|
* 8. Collapse all strings of repeated characters
|
||||||
|
* 9. Add original first character of name as first character of key
|
||||||
|
* </pre></p>
|
||||||
|
*
|
||||||
|
* @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
|
||||||
|
* @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class Nysiis implements StringEncoder {
|
||||||
|
|
||||||
|
private static final char[] CHARS_A = new char[]{'A'};
|
||||||
|
private static final char[] CHARS_AF = new char[]{'A', 'F'};
|
||||||
|
private static final char[] CHARS_C = new char[]{'C'};
|
||||||
|
private static final char[] CHARS_FF = new char[]{'F', 'F'};
|
||||||
|
private static final char[] CHARS_G = new char[]{'G'};
|
||||||
|
private static final char[] CHARS_N = new char[]{'N'};
|
||||||
|
private static final char[] CHARS_NN = new char[]{'N', 'N'};
|
||||||
|
private static final char[] CHARS_S = new char[]{'S'};
|
||||||
|
private static final char[] CHARS_SSS = new char[]{'S', 'S', 'S'};
|
||||||
|
private static final Pattern PAT_MAC = Pattern.compile("^MAC");
|
||||||
|
private static final Pattern PAT_KN = Pattern.compile("^KN");
|
||||||
|
private static final Pattern PAT_K = Pattern.compile("^K");
|
||||||
|
private static final Pattern PAT_PH_PF = Pattern.compile("^(PH|PF)");
|
||||||
|
private static final Pattern PAT_SCH = Pattern.compile("^SCH");
|
||||||
|
private static final Pattern PAT_EE_IE = Pattern.compile("(EE|IE)$");
|
||||||
|
private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
|
||||||
|
private static final char SPACE = ' ';
|
||||||
|
private static final int TRUE_LENGTH = 6;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests if the given character is a vowel.
|
||||||
|
*
|
||||||
|
* @param c the character to test
|
||||||
|
* @return {@code true} if the character is a vowel, {@code false} otherwise
|
||||||
|
*/
|
||||||
|
private static boolean isVowel(final char c) {
|
||||||
|
return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Transcodes the remaining parts of the String. The method operates on a
|
||||||
|
* sliding window, looking at 4 characters at a time: [i-1, i, i+1, i+2].
|
||||||
|
*
|
||||||
|
* @param prev the previous character
|
||||||
|
* @param curr the current character
|
||||||
|
* @param next the next character
|
||||||
|
* @param aNext the after next character
|
||||||
|
* @return a transcoded array of characters, starting from the current
|
||||||
|
* position
|
||||||
|
*/
|
||||||
|
private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
|
||||||
|
// 1. EV -> AF
|
||||||
|
if (curr == 'E' && next == 'V') {
|
||||||
|
return CHARS_AF;
|
||||||
|
}
|
||||||
|
|
||||||
|
// A, E, I, O, U -> A
|
||||||
|
if (isVowel(curr)) {
|
||||||
|
return CHARS_A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Q -> G, Z -> S, M -> N
|
||||||
|
if (curr == 'Q') {
|
||||||
|
return CHARS_G;
|
||||||
|
} else if (curr == 'Z') {
|
||||||
|
return CHARS_S;
|
||||||
|
} else if (curr == 'M') {
|
||||||
|
return CHARS_N;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. KN -> NN else K -> C
|
||||||
|
if (curr == 'K') {
|
||||||
|
if (next == 'N') {
|
||||||
|
return CHARS_NN;
|
||||||
|
} else {
|
||||||
|
return CHARS_C;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. SCH -> SSS
|
||||||
|
if (curr == 'S' && next == 'C' && aNext == 'H') {
|
||||||
|
return CHARS_SSS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// PH -> FF
|
||||||
|
if (curr == 'P' && next == 'H') {
|
||||||
|
return CHARS_FF;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. H -> If previous or next is a non vowel, previous.
|
||||||
|
if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
|
||||||
|
return new char[]{prev};
|
||||||
|
}
|
||||||
|
|
||||||
|
// 6. W -> If previous is vowel, previous.
|
||||||
|
if (curr == 'W' && isVowel(prev)) {
|
||||||
|
return new char[]{prev};
|
||||||
|
}
|
||||||
|
|
||||||
|
return new char[]{curr};
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Indicates the strict mode.
|
||||||
|
*/
|
||||||
|
private final boolean strict;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an instance of the {@link Nysiis} encoder with strict mode
|
||||||
|
* (original form), i.e. encoded strings have a maximum length of 6.
|
||||||
|
*/
|
||||||
|
public Nysiis() {
|
||||||
|
this(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create an instance of the {@link Nysiis} encoder with the specified
|
||||||
|
* strict mode:
|
||||||
|
*
|
||||||
|
* <ul> <li>{@code true}: encoded strings have a maximum length of 6</li> <li>{@code false}:
|
||||||
|
* encoded strings may have arbitrary length</li> </ul>
|
||||||
|
*
|
||||||
|
* @param strict the strict mode
|
||||||
|
*/
|
||||||
|
public Nysiis(final boolean strict) {
|
||||||
|
this.strict = strict;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encodes an Object using the NYSIIS algorithm. This method is provided in
|
||||||
|
* order to satisfy the requirements of the Encoder interface, and will
|
||||||
|
* throw an {@link EncoderException} if the supplied object is not of type
|
||||||
|
* {@link String}.
|
||||||
|
*
|
||||||
|
* @param obj Object to encode
|
||||||
|
* @return An object (or a {@link String}) containing the NYSIIS code which
|
||||||
|
* corresponds to the given String.
|
||||||
|
* @throws EncoderException if the parameter supplied is not of a {@link String}
|
||||||
|
* @throws IllegalArgumentException if a character is not mapped
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public Object encode(Object obj) throws EncoderException {
|
||||||
|
if (!(obj instanceof String)) {
|
||||||
|
throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
|
||||||
|
}
|
||||||
|
return this.nysiis((String) obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encodes a String using the NYSIIS algorithm.
|
||||||
|
*
|
||||||
|
* @param str A String object to encode
|
||||||
|
* @return A Nysiis code corresponding to the String supplied
|
||||||
|
* @throws IllegalArgumentException if a character is not mapped
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String encode(String str) {
|
||||||
|
return this.nysiis(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates the strict mode for this {@link Nysiis} encoder.
|
||||||
|
*
|
||||||
|
* @return {@code true} if the encoder is configured for strict mode, {@code false}
|
||||||
|
* otherwise
|
||||||
|
*/
|
||||||
|
public boolean isStrict() {
|
||||||
|
return this.strict;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the NYSIIS code for a given String object.
|
||||||
|
*
|
||||||
|
* @param str String to encode using the NYSIIS algorithm
|
||||||
|
* @return A NYSIIS code for the String supplied
|
||||||
|
*/
|
||||||
|
public String nysiis(String str) {
|
||||||
|
if (str == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the same clean rules as Soundex
|
||||||
|
str = clean(str);
|
||||||
|
|
||||||
|
if (str.length() == 0) {
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translate first characters of name:
|
||||||
|
// MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
|
||||||
|
str = PAT_MAC.matcher(str).replaceFirst("MCC");
|
||||||
|
str = PAT_KN.matcher(str).replaceFirst("NN");
|
||||||
|
str = PAT_K.matcher(str).replaceFirst("C");
|
||||||
|
str = PAT_PH_PF.matcher(str).replaceFirst("FF");
|
||||||
|
str = PAT_SCH.matcher(str).replaceFirst("SSS");
|
||||||
|
|
||||||
|
// Translate last characters of name:
|
||||||
|
// EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
|
||||||
|
str = PAT_EE_IE.matcher(str).replaceFirst("Y");
|
||||||
|
str = PAT_DT_ETC.matcher(str).replaceFirst("D");
|
||||||
|
|
||||||
|
// First character of key = first character of name.
|
||||||
|
StringBuffer key = new StringBuffer(str.length());
|
||||||
|
key.append(str.charAt(0));
|
||||||
|
|
||||||
|
// Transcode remaining characters, incrementing by one character each time
|
||||||
|
final char[] chars = str.toCharArray();
|
||||||
|
final int len = chars.length;
|
||||||
|
|
||||||
|
for (int i = 1; i < len; i++) {
|
||||||
|
final char next = i < len - 1 ? chars[i + 1] : SPACE;
|
||||||
|
final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
|
||||||
|
final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
|
||||||
|
System.arraycopy(transcoded, 0, chars, i, transcoded.length);
|
||||||
|
|
||||||
|
// only append the current char to the key if it is different from the last one
|
||||||
|
if (chars[i] != chars[i - 1]) {
|
||||||
|
key.append(chars[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (key.length() > 1) {
|
||||||
|
char lastChar = key.charAt(key.length() - 1);
|
||||||
|
|
||||||
|
// If last character is S, remove it.
|
||||||
|
if (lastChar == 'S') {
|
||||||
|
key.deleteCharAt(key.length() - 1);
|
||||||
|
lastChar = key.charAt(key.length() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (key.length() > 2) {
|
||||||
|
final char last2Char = key.charAt(key.length() - 2);
|
||||||
|
// If last characters are AY, replace with Y.
|
||||||
|
if (last2Char == 'A' && lastChar == 'Y') {
|
||||||
|
key.deleteCharAt(key.length() - 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If last character is A, remove it.
|
||||||
|
if (lastChar == 'A') {
|
||||||
|
key.deleteCharAt(key.length() - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final String string = key.toString();
|
||||||
|
return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
|
||||||
|
}
|
||||||
|
|
||||||
|
static String clean(String str) {
|
||||||
|
if (str == null || str.length() == 0) {
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
int len = str.length();
|
||||||
|
char[] chars = new char[len];
|
||||||
|
int count = 0;
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
if (Character.isLetter(str.charAt(i))) {
|
||||||
|
chars[count++] = str.charAt(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (count == len) {
|
||||||
|
return str.toUpperCase(java.util.Locale.ENGLISH);
|
||||||
|
}
|
||||||
|
return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.plugin.analysis;
|
||||||
|
|
||||||
|
import org.elasticsearch.index.analysis.AnalysisModule;
|
||||||
|
import org.elasticsearch.index.analysis.PhoneticAnalysisBinderProcessor;
|
||||||
|
import org.elasticsearch.plugins.AbstractPlugin;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*/
|
||||||
|
public class AnalysisPhoneticPlugin extends AbstractPlugin {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return "analysis-phonetic";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String description() {
|
||||||
|
return "Phonetic analysis support";
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onModule(AnalysisModule module) {
|
||||||
|
module.addProcessor(new PhoneticAnalysisBinderProcessor());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
plugin=org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin
|
||||||
|
version=${project.version}
|
||||||
|
lucene=${lucene.version}
|
|
@ -0,0 +1,72 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||||
|
import org.elasticsearch.common.inject.Injector;
|
||||||
|
import org.elasticsearch.common.inject.ModulesBuilder;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.common.settings.SettingsModule;
|
||||||
|
import org.elasticsearch.env.Environment;
|
||||||
|
import org.elasticsearch.env.EnvironmentModule;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.IndexNameModule;
|
||||||
|
import org.elasticsearch.index.settings.IndexSettingsModule;
|
||||||
|
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
|
||||||
|
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
|
||||||
|
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||||
|
import org.hamcrest.MatcherAssert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import static org.elasticsearch.common.settings.Settings.settingsBuilder;
|
||||||
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*/
|
||||||
|
public class SimplePhoneticAnalysisTests extends ElasticsearchTestCase {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPhoneticTokenFilterFactory() {
|
||||||
|
Settings settings = settingsBuilder().loadFromClasspath("org/elasticsearch/index/analysis/phonetic-1.yml")
|
||||||
|
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||||
|
.put("path.home", createTempDir())
|
||||||
|
.build();
|
||||||
|
AnalysisService analysisService = testSimpleConfiguration(settings);
|
||||||
|
TokenFilterFactory filterFactory = analysisService.tokenFilter("phonetic");
|
||||||
|
MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
|
||||||
|
}
|
||||||
|
|
||||||
|
private AnalysisService testSimpleConfiguration(Settings settings) {
|
||||||
|
Index index = new Index("test");
|
||||||
|
|
||||||
|
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
|
||||||
|
new EnvironmentModule(new Environment(settings)),
|
||||||
|
new IndicesAnalysisModule()).createInjector();
|
||||||
|
Injector injector = new ModulesBuilder().add(
|
||||||
|
new IndexSettingsModule(index, settings),
|
||||||
|
new IndexNameModule(index),
|
||||||
|
new AnalysisModule(settings, parentInjector.getInstance(IndicesAnalysisService.class))
|
||||||
|
.addProcessor(new PhoneticAnalysisBinderProcessor())).createChildInjector(parentInjector);
|
||||||
|
|
||||||
|
AnalysisService analysisService = injector.getInstance(AnalysisService.class);
|
||||||
|
return analysisService;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,108 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
|
||||||
|
import org.elasticsearch.action.search.SearchResponse;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
|
import org.elasticsearch.index.query.QueryBuilders;
|
||||||
|
import org.elasticsearch.plugins.PluginsService;
|
||||||
|
import org.elasticsearch.test.ElasticsearchIntegrationTest;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.concurrent.ExecutionException;
|
||||||
|
|
||||||
|
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||||
|
import static org.hamcrest.CoreMatchers.is;
|
||||||
|
import static org.hamcrest.CoreMatchers.notNullValue;
|
||||||
|
|
||||||
|
@ElasticsearchIntegrationTest.ClusterScope(numDataNodes = 1, scope = ElasticsearchIntegrationTest.Scope.SUITE)
|
||||||
|
public class SimplePhoneticIntegrationTests extends ElasticsearchIntegrationTest {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Settings nodeSettings(int nodeOrdinal) {
|
||||||
|
return Settings.builder()
|
||||||
|
.put(super.nodeSettings(nodeOrdinal))
|
||||||
|
.put("plugins." + PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Settings indexSettings() {
|
||||||
|
Settings settings = Settings.builder()
|
||||||
|
.put(super.indexSettings())
|
||||||
|
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
|
||||||
|
.putArray("index.analysis.analyzer.my_analyzer.filter", "standard", "lowercase", "my_metaphone")
|
||||||
|
.put("index.analysis.filter.my_metaphone.type", "phonetic")
|
||||||
|
.put("index.analysis.filter.my_metaphone.encoder", "metaphone")
|
||||||
|
.put("index.analysis.filter.my_metaphone.replace", false)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
return settings;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPhoneticAnalyzer() throws ExecutionException, InterruptedException {
|
||||||
|
createIndex("test");
|
||||||
|
ensureGreen("test");
|
||||||
|
AnalyzeResponse response = client().admin().indices()
|
||||||
|
.prepareAnalyze("hello world")
|
||||||
|
.setIndex("test")
|
||||||
|
.setAnalyzer("my_analyzer")
|
||||||
|
.execute().get();
|
||||||
|
|
||||||
|
assertThat(response, notNullValue());
|
||||||
|
assertThat(response.getTokens().size(), is(4));
|
||||||
|
assertThat(response.getTokens().get(0).getTerm(), is("HL"));
|
||||||
|
assertThat(response.getTokens().get(1).getTerm(), is("hello"));
|
||||||
|
assertThat(response.getTokens().get(2).getTerm(), is("WRLT"));
|
||||||
|
assertThat(response.getTokens().get(3).getTerm(), is("world"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testPhoneticAnalyzerInMapping() throws ExecutionException, InterruptedException, IOException {
|
||||||
|
createIndex("test");
|
||||||
|
ensureGreen("test");
|
||||||
|
final XContentBuilder mapping = jsonBuilder().startObject()
|
||||||
|
.startObject("type")
|
||||||
|
.startObject("properties")
|
||||||
|
.startObject("foo")
|
||||||
|
.field("type", "string")
|
||||||
|
.field("analyzer", "my_analyzer")
|
||||||
|
.endObject()
|
||||||
|
.endObject()
|
||||||
|
.endObject()
|
||||||
|
.endObject();
|
||||||
|
|
||||||
|
client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get();
|
||||||
|
|
||||||
|
index("test", "type", "1", "foo", "hello world");
|
||||||
|
refresh();
|
||||||
|
|
||||||
|
SearchResponse response = client().prepareSearch("test").setQuery(
|
||||||
|
QueryBuilders.matchQuery("foo", "helllo")
|
||||||
|
).execute().actionGet();
|
||||||
|
|
||||||
|
assertThat(response.getHits().getTotalHits(), is(1L));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
# Token filter definitions of type "phonetic", one per encoder name.
# NOTE(review): presumably the phonetic-1.yml fixture loaded by
# SimplePhoneticAnalysisTests - confirm against the file path in the commit.
index:
    analysis:
        filter:
            doublemetaphonefilter:
                type: phonetic
                encoder: doublemetaphone
            metaphonefilter:
                type: phonetic
                encoder: metaphone
            soundexfilter:
                type: phonetic
                encoder: soundex
            refinedsoundexfilter:
                type: phonetic
                encoder: refinedsoundex
            caverphonefilter:
                type: phonetic
                encoder: caverphone
            beidermorsefilter:
                type: phonetic
                encoder: beidermorse
            koelnerphonetikfilter:
                type: phonetic
                encoder: koelnerphonetik
            haasephonetikfilter:
                type: phonetic
                encoder: haasephonetik
            nysiisfilter:
                type: phonetic
                encoder: nysiis
|
Loading…
Reference in New Issue