From a488424404e1bfbe9d353353f5c9d7da9959e222 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Wed, 21 Dec 2011 03:53:44 +0200 Subject: [PATCH] Analysis: Add phonetic encoder called `bm` or `beider_morse`, closes #1552. --- pom.xml | 2 +- .../phonetic/PhoneticTokenFilterFactory.java | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c2d40dd6936..694918e3ab3 100644 --- a/pom.xml +++ b/pom.xml @@ -124,7 +124,7 @@ commons-codec commons-codec - 1.5 + 1.6 compile diff --git a/src/main/java/org/elasticsearch/index/analysis/phonetic/PhoneticTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/phonetic/PhoneticTokenFilterFactory.java index e8c4657dfe0..b90671f5519 100644 --- a/src/main/java/org/elasticsearch/index/analysis/phonetic/PhoneticTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/phonetic/PhoneticTokenFilterFactory.java @@ -21,6 +21,9 @@ package org.elasticsearch.index.analysis.phonetic; import org.apache.commons.codec.Encoder; import org.apache.commons.codec.language.*; +import org.apache.commons.codec.language.bm.BeiderMorseEncoder; +import org.apache.commons.codec.language.bm.NameType; +import org.apache.commons.codec.language.bm.RuleType; import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.inject.Inject; @@ -67,6 +70,25 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { DoubleMetaphone doubleMetaphone = new DoubleMetaphone(); doubleMetaphone.setMaxCodeLen(settings.getAsInt("max_code_len", doubleMetaphone.getMaxCodeLen())); this.encoder = doubleMetaphone; + } else if ("bm".equalsIgnoreCase(encoder) || "beider_morse".equalsIgnoreCase(encoder)) { + BeiderMorseEncoder bm = new BeiderMorseEncoder(); + String ruleType = settings.get("rule_type", "approx"); + if ("approx".equalsIgnoreCase(ruleType)) { + bm.setRuleType(RuleType.APPROX); + } else 
if ("exact".equalsIgnoreCase(ruleType)) { + bm.setRuleType(RuleType.EXACT); + } else { + throw new ElasticSearchIllegalArgumentException("No matching rule type [" + ruleType + "] for beider morse encoder"); + } + String nameType = settings.get("name_type", "generic"); + if ("GENERIC".equalsIgnoreCase(nameType)) { + bm.setNameType(NameType.GENERIC); + } else if ("ASHKENAZI".equalsIgnoreCase(nameType)) { + bm.setNameType(NameType.ASHKENAZI); + } else if ("SEPHARDIC".equalsIgnoreCase(nameType)) { + bm.setNameType(NameType.SEPHARDIC); + } + this.encoder = bm; } else { throw new ElasticSearchIllegalArgumentException("unknown encoder [" + encoder + "] for phonetic token filter"); }