From d5601eb3714d86b1baff2591a9e7a380f3c96c4a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 30 Oct 2011 01:00:06 +0000 Subject: [PATCH] SOLR-2276: Support for cologne phonetic git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1195082 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/eclipse/dot.classpath | 4 +- dev-tools/maven/pom.xml.template | 2 +- .../phonetic/lib/commons-codec-1.4.jar | 2 - .../phonetic/lib/commons-codec-1.5.jar | 2 + solr/CHANGES.txt | 3 + .../solr/analysis/PhoneticFilterFactory.java | 54 +++++++++++--- .../analysis/TestPhoneticFilterFactory.java | 73 +++++++++++++++++-- solr/lib/commons-codec-1.4.jar | 2 - solr/lib/commons-codec-1.5.jar | 2 + 9 files changed, 122 insertions(+), 22 deletions(-) delete mode 100644 modules/analysis/phonetic/lib/commons-codec-1.4.jar create mode 100644 modules/analysis/phonetic/lib/commons-codec-1.5.jar delete mode 100644 solr/lib/commons-codec-1.4.jar create mode 100644 solr/lib/commons-codec-1.5.jar diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath index 4d36b791dac..58768413e34 100644 --- a/dev-tools/eclipse/dot.classpath +++ b/dev-tools/eclipse/dot.classpath @@ -87,7 +87,7 @@ - + @@ -98,7 +98,7 @@ - + diff --git a/dev-tools/maven/pom.xml.template b/dev-tools/maven/pom.xml.template index c805351c812..481eef4b71b 100644 --- a/dev-tools/maven/pom.xml.template +++ b/dev-tools/maven/pom.xml.template @@ -138,7 +138,7 @@ commons-codec commons-codec - 1.4 + 1.5 commons-collections diff --git a/modules/analysis/phonetic/lib/commons-codec-1.4.jar b/modules/analysis/phonetic/lib/commons-codec-1.4.jar deleted file mode 100644 index 97a58157492..00000000000 --- a/modules/analysis/phonetic/lib/commons-codec-1.4.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/modules/analysis/phonetic/lib/commons-codec-1.5.jar b/modules/analysis/phonetic/lib/commons-codec-1.5.jar new file mode 100644 index 00000000000..3e3f8394814 --- /dev/null +++ b/modules/analysis/phonetic/lib/commons-codec-1.5.jar @@ -0,0 +1,2 @@ +AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 22109ed23e6..32a154ec937 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -380,6 +380,9 @@ New Features * SOLR-2818: Added before/after count response parsing support for range facets in SolrJ. (Bernhard Frauendienst via Martijn van Groningen) + +* SOLR-2276: Add support for cologne phonetic to PhoneticFilterFactory. + (Marc Pompl via rmuir) Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java index 624fbbf71b4..f9807554ab9 100644 --- a/solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java @@ -21,9 +21,12 @@ import java.lang.reflect.Method; import java.util.HashMap; import java.util.Locale; import java.util.Map; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.codec.Encoder; import org.apache.commons.codec.language.Caverphone; +import org.apache.commons.codec.language.ColognePhonetic; import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.commons.codec.language.Metaphone; import org.apache.commons.codec.language.RefinedSoundex; @@ -59,16 +62,18 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory { public static final String ENCODER = "encoder"; public static final String INJECT = "inject"; // boolean + private static final String PACKAGE_CONTAINIG_ENCODERS = "org.apache.commons.codec.language."; - private static final Map> registry; - static { - registry = new HashMap>(); - registry.put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class ); - registry.put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class ); - registry.put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class ); - registry.put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class ); - registry.put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone.class ); - } + private static final Map> registry = new HashMap>() + {{ + put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class ); + put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class ); + put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class ); + put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class ); + put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone.class ); + put( "ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class ); + }}; + private static final Lock lock = new ReentrantLock(); protected boolean inject = true; protected String name = null; @@ -87,7 +92,12 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory } Class clazz = registry.get(name.toUpperCase(Locale.ENGLISH)); if( clazz == null ) { - throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Unknown encoder: "+name +" ["+registry.keySet()+"]" ); + lock.lock(); + try { + clazz = resolveEncoder(name); + } finally { + lock.unlock(); + } } try { @@ -105,6 +115,30 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory } } + private Class resolveEncoder(String name) { + Class clazz = null; + try { + clazz = lookupEncoder(PACKAGE_CONTAINIG_ENCODERS+name); + } catch (ClassNotFoundException e) { + try { + clazz = lookupEncoder(name); + } catch (ClassNotFoundException cnfe) { + throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Unknown encoder: "+name +" ["+registry.keySet()+"]" ); + } + } + catch (ClassCastException e) { + throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Not an encoder: "+name +" ["+registry.keySet()+"]" ); + } + return clazz; + } + + private Class lookupEncoder(String name) + throws ClassNotFoundException { + Class clazz = Class.forName(name).asSubclass(Encoder.class); + registry.put( name.toUpperCase(Locale.ENGLISH), clazz ); + return clazz; + } + public PhoneticFilter create(TokenStream input) { return new PhoneticFilter(input,encoder,inject); } diff --git a/solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java b/solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java index 02e27e81dab..cd7eeb1c472 100644 --- a/solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java +++ b/solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java @@ -32,10 +32,34 @@ import org.apache.lucene.analysis.Tokenizer; */ public class TestPhoneticFilterFactory extends BaseTokenTestCase { + private static final int REPEATS = 100000; + + /** + * Case: default + */ public void testFactory() { Map args = new HashMap(); + PhoneticFilterFactory ff = new PhoneticFilterFactory(); + + args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); + ff.init( args ); + assertTrue( ff.encoder instanceof Metaphone ); + assertTrue( ff.inject ); // default + + args.put( PhoneticFilterFactory.INJECT, "false" ); + ff.init( args ); + assertFalse( ff.inject ); + } + + /** + * Case: Failures and Exceptions + */ + public void testFactoryCaseFailure() + { + Map args = new HashMap(); + PhoneticFilterFactory ff = new PhoneticFilterFactory(); try { ff.init( args ); @@ -48,15 +72,27 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase { fail( "unknown encoder parameter" ); } catch( Exception ex ) {} + args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.NonExistence" ); + try { + ff.init( args ); + fail( "unknown encoder parameter" ); + } + catch( Exception ex ) {} + } + + /** + * Case: Reflection + */ + public void testFactoryCaseReflection() + { + Map args = new HashMap(); - args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); + PhoneticFilterFactory ff = new PhoneticFilterFactory(); + + args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" ); ff.init( args ); assertTrue( ff.encoder instanceof Metaphone ); assertTrue( ff.inject ); // default - - args.put( PhoneticFilterFactory.INJECT, "false" ); - ff.init( args ); - assertFalse( ff.inject ); } public void testAlgorithms() throws Exception { @@ -85,6 +121,12 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase { "TTA1111111", "Datha", "KLN1111111", "Carlene" }); assertAlgorithm("Caverphone", "false", "Darda Karleen Datha Carlene", new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); + + assertAlgorithm("ColognePhonetic", "true", "Meier Schmitt Meir Schmidt", + new String[] { "67", "Meier", "862", "Schmitt", + "67", "Meir", "862", "Schmidt" }); + assertAlgorithm("ColognePhonetic", "false", "Meier Schmitt Meir Schmidt", + new String[] { "67", "862", "67", "862" }); } static void assertAlgorithm(String algName, String inject, String input, @@ -98,4 +140,25 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase { TokenStream stream = factory.create(tokenizer); assertTokenStreamContents(stream, expected); } + + public void testSpeed() throws Exception { + checkSpeedEncoding("Metaphone", "easgasg", "ESKS"); + checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS"); + checkSpeedEncoding("Soundex", "easgasg", "E220"); + checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034"); + checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111"); + checkSpeedEncoding("ColognePhonetic", "Schmitt", "862"); + } + + private void checkSpeedEncoding(String encoder, String toBeEncoded, String estimated) throws Exception { + long start = System.currentTimeMillis(); + for ( int i=0; i