diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7fde7b4db3b..8854cbe9fae 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -79,6 +79,9 @@ New Features * LUCENE-6053: Add Serbian analyzer. (Nikola Smolenski via Robert Muir, Mike McCandless) +* LUCENE-4400: Add support for new NYSIIS Apache commons phonetic + codec (Thomas Neidhart via Mike McCandless) + API Changes * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and diff --git a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java index b2620d9bb13..c357f723595 100644 --- a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java +++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java @@ -18,8 +18,8 @@ package org.apache.lucene.analysis.phonetic; */ import java.io.IOException; -import java.lang.reflect.Method; import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.HashMap; import java.util.Locale; import java.util.Map; @@ -29,6 +29,7 @@ import org.apache.commons.codec.language.Caverphone2; import org.apache.commons.codec.language.ColognePhonetic; import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.Nysiis; import org.apache.commons.codec.language.RefinedSoundex; import org.apache.commons.codec.language.Soundex; import org.apache.lucene.analysis.TokenStream; @@ -46,8 +47,8 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * This takes one required argument, "encoder", and the rest are optional: *
*
encoder
required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0), - * or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by - * itself if it already contains a '.' or otherwise as in the same package as these others.
+ * "ColognePhonetic" or "Nysiis" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name + * either by itself if it already contains a '.' or otherwise as in the same package as these others. *
inject
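(default=true) if true, add the phonetic tokens to the stream alongside the original tokens (with the offset=0); if false, emit only the phonetic tokens in place of the originals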
*
maxCodeLength
The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't * support this then specifying this is an error.
@@ -82,6 +83,7 @@ public class PhoneticFilterFactory extends TokenFilterFactory implements Resourc registry.put("RefinedSoundex".toUpperCase(Locale.ROOT), RefinedSoundex.class); registry.put("Caverphone".toUpperCase(Locale.ROOT), Caverphone2.class); registry.put("ColognePhonetic".toUpperCase(Locale.ROOT), ColognePhonetic.class); + registry.put("Nysiis".toUpperCase(Locale.ROOT), Nysiis.class); } final boolean inject; //accessed by the test diff --git a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java index a47e64990de..f5aff7a741e 100644 --- a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java +++ b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilter.java @@ -21,7 +21,12 @@ import java.io.IOException; import java.io.StringReader; import org.apache.commons.codec.Encoder; -import org.apache.commons.codec.language.*; +import org.apache.commons.codec.language.Caverphone2; +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.Nysiis; +import org.apache.commons.codec.language.RefinedSoundex; +import org.apache.commons.codec.language.Soundex; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; @@ -59,6 +64,11 @@ public class TestPhoneticFilter extends BaseTokenStreamTestCase { "TTA1111111", "Datha", "KLN1111111", "Carlene" }); assertAlgorithm(new Caverphone2(), false, "Darda Karleen Datha Carlene", new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); + + assertAlgorithm(new Nysiis(), true, "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "B", "bbb", "C", "ccc", "EASGAS", "easgasg" }); + assertAlgorithm(new Nysiis(), false, "aaa bbb ccc 
easgasg", + new String[] { "A", "B", "C", "EASGAS" }); } diff --git a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java index c919da4b0f6..70b654d580d 100644 --- a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java +++ b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java @@ -18,14 +18,12 @@ package org.apache.lucene.analysis.phonetic; */ import java.io.IOException; -import java.io.StringReader; import java.util.HashMap; import java.util.Map; -import org.apache.commons.codec.language.Metaphone; import org.apache.commons.codec.language.Caverphone2; +import org.apache.commons.codec.language.Metaphone; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.util.ClasspathResourceLoader; @@ -164,6 +162,12 @@ public class TestPhoneticFilterFactory extends BaseTokenStreamTestCase { "67", "Meir", "862", "Schmidt" }); assertAlgorithm("ColognePhonetic", "false", "Meier Schmitt Meir Schmidt", new String[] { "67", "862", "67", "862" }); + + assertAlgorithm("Nysiis", "true", "Macintosh Knuth Bart Hurd", + new String[] { "MCANT", "Macintosh", "NAT", "Knuth", + "BAD", "Bart", "HAD", "Hurd" }); + assertAlgorithm("Nysiis", "false", "Macintosh Knuth Bart Hurd", + new String[] { "MCANT", "NAT", "BAD", "HAD" }); } /** Test that bogus arguments result in exception */