SOLR-2276: Support for cologne phonetic

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1195082 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-10-30 01:00:06 +00:00
parent f5ee5914b0
commit d5601eb371
9 changed files with 122 additions and 22 deletions

View File

@ -87,7 +87,7 @@
<classpathentry kind="lib" path="lucene/lib/junit-4.7.jar"/>
<classpathentry kind="lib" path="lucene/contrib/sandbox/lib/jakarta-regexp-1.4.jar"/>
<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8_1_1.jar"/>
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.5.jar"/>
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
@ -98,7 +98,7 @@
<classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1099557.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-codec-1.4.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-codec-1.5.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-csv-1.0-SNAPSHOT-r966014.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-httpclient-3.1.jar"/>

View File

@ -138,7 +138,7 @@
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.4</version>
<version>1.5</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>

View File

@ -1,2 +0,0 @@
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
Apache SVN contains full history.

View File

@ -380,6 +380,9 @@ New Features
* SOLR-2818: Added before/after count response parsing support for range facets in
SolrJ. (Bernhard Frauendienst via Martijn van Groningen)
* SOLR-2276: Add support for cologne phonetic to PhoneticFilterFactory.
(Marc Pompl via rmuir)
Bug Fixes
----------------------

View File

@ -21,9 +21,12 @@ import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.Caverphone;
import org.apache.commons.codec.language.ColognePhonetic;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
@ -59,16 +62,18 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
{
public static final String ENCODER = "encoder";
public static final String INJECT = "inject"; // boolean
private static final String PACKAGE_CONTAINIG_ENCODERS = "org.apache.commons.codec.language.";
private static final Map<String, Class<? extends Encoder>> registry;
static {
registry = new HashMap<String, Class<? extends Encoder>>();
registry.put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class );
registry.put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class );
registry.put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class );
registry.put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class );
registry.put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone.class );
}
private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>()
{{
put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class );
put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class );
put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class );
put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class );
put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone.class );
put( "ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class );
}};
private static final Lock lock = new ReentrantLock();
protected boolean inject = true;
protected String name = null;
@ -87,7 +92,12 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
}
Class<? extends Encoder> clazz = registry.get(name.toUpperCase(Locale.ENGLISH));
if( clazz == null ) {
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Unknown encoder: "+name +" ["+registry.keySet()+"]" );
lock.lock();
try {
clazz = resolveEncoder(name);
} finally {
lock.unlock();
}
}
try {
@ -105,6 +115,30 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
}
}
private Class<? extends Encoder> resolveEncoder(String name) {
Class<? extends Encoder> clazz = null;
try {
clazz = lookupEncoder(PACKAGE_CONTAINIG_ENCODERS+name);
} catch (ClassNotFoundException e) {
try {
clazz = lookupEncoder(name);
} catch (ClassNotFoundException cnfe) {
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Unknown encoder: "+name +" ["+registry.keySet()+"]" );
}
}
catch (ClassCastException e) {
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Not an encoder: "+name +" ["+registry.keySet()+"]" );
}
return clazz;
}
private Class<? extends Encoder> lookupEncoder(String name)
throws ClassNotFoundException {
Class<? extends Encoder> clazz = Class.forName(name).asSubclass(Encoder.class);
registry.put( name.toUpperCase(Locale.ENGLISH), clazz );
return clazz;
}
public PhoneticFilter create(TokenStream input) {
return new PhoneticFilter(input,encoder,inject);
}

View File

@ -32,10 +32,34 @@ import org.apache.lucene.analysis.Tokenizer;
*/
public class TestPhoneticFilterFactory extends BaseTokenTestCase {
private static final int REPEATS = 100000;
/**
* Case: default
*/
public void testFactory()
{
Map<String,String> args = new HashMap<String, String>();
PhoneticFilterFactory ff = new PhoneticFilterFactory();
args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
ff.init( args );
assertTrue( ff.encoder instanceof Metaphone );
assertTrue( ff.inject ); // default
args.put( PhoneticFilterFactory.INJECT, "false" );
ff.init( args );
assertFalse( ff.inject );
}
/**
* Case: Failures and Exceptions
*/
public void testFactoryCaseFailure()
{
Map<String,String> args = new HashMap<String, String>();
PhoneticFilterFactory ff = new PhoneticFilterFactory();
try {
ff.init( args );
@ -48,15 +72,27 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
fail( "unknown encoder parameter" );
}
catch( Exception ex ) {}
args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.NonExistence" );
try {
ff.init( args );
fail( "unknown encoder parameter" );
}
catch( Exception ex ) {}
}
/**
* Case: Reflection
*/
public void testFactoryCaseReflection()
{
Map<String,String> args = new HashMap<String, String>();
args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
PhoneticFilterFactory ff = new PhoneticFilterFactory();
args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" );
ff.init( args );
assertTrue( ff.encoder instanceof Metaphone );
assertTrue( ff.inject ); // default
args.put( PhoneticFilterFactory.INJECT, "false" );
ff.init( args );
assertFalse( ff.inject );
}
public void testAlgorithms() throws Exception {
@ -85,6 +121,12 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
"TTA1111111", "Datha", "KLN1111111", "Carlene" });
assertAlgorithm("Caverphone", "false", "Darda Karleen Datha Carlene",
new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });
assertAlgorithm("ColognePhonetic", "true", "Meier Schmitt Meir Schmidt",
new String[] { "67", "Meier", "862", "Schmitt",
"67", "Meir", "862", "Schmidt" });
assertAlgorithm("ColognePhonetic", "false", "Meier Schmitt Meir Schmidt",
new String[] { "67", "862", "67", "862" });
}
static void assertAlgorithm(String algName, String inject, String input,
@ -98,4 +140,25 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, expected);
}
public void testSpeed() throws Exception {
checkSpeedEncoding("Metaphone", "easgasg", "ESKS");
checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS");
checkSpeedEncoding("Soundex", "easgasg", "E220");
checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034");
checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111");
checkSpeedEncoding("ColognePhonetic", "Schmitt", "862");
}
private void checkSpeedEncoding(String encoder, String toBeEncoded, String estimated) throws Exception {
long start = System.currentTimeMillis();
for ( int i=0; i<REPEATS; i++) {
assertAlgorithm(encoder, "false", toBeEncoded,
new String[] { estimated });
}
long duration = System.currentTimeMillis()-start;
if (VERBOSE)
System.out.println(encoder + " encodings per msec: "+(REPEATS/duration));
}
}

View File

@ -1,2 +0,0 @@
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
Apache SVN contains full history.