mirror of https://github.com/apache/lucene.git
SOLR-2276: Support for cologne phonetic
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1195082 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f5ee5914b0
commit
d5601eb371
|
@ -87,7 +87,7 @@
|
|||
<classpathentry kind="lib" path="lucene/lib/junit-4.7.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/contrib/sandbox/lib/jakarta-regexp-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8_1_1.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.5.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
|
||||
|
@ -98,7 +98,7 @@
|
|||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
|
||||
<classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1099557.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-codec-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-codec-1.5.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-csv-1.0-SNAPSHOT-r966014.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-httpclient-3.1.jar"/>
|
||||
|
|
|
@ -138,7 +138,7 @@
|
|||
<dependency>
|
||||
<groupId>commons-codec</groupId>
|
||||
<artifactId>commons-codec</artifactId>
|
||||
<version>1.4</version>
|
||||
<version>1.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-collections</groupId>
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -380,6 +380,9 @@ New Features
|
|||
|
||||
* SOLR-2818: Added before/after count response parsing support for range facets in
|
||||
SolrJ. (Bernhard Frauendienst via Martijn van Groningen)
|
||||
|
||||
* SOLR-2276: Add support for cologne phonetic to PhoneticFilterFactory.
|
||||
(Marc Pompl via rmuir)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
|
|
@ -21,9 +21,12 @@ import java.lang.reflect.Method;
|
|||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
import org.apache.commons.codec.Encoder;
|
||||
import org.apache.commons.codec.language.Caverphone;
|
||||
import org.apache.commons.codec.language.ColognePhonetic;
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
import org.apache.commons.codec.language.RefinedSoundex;
|
||||
|
@ -59,16 +62,18 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
|
|||
{
|
||||
public static final String ENCODER = "encoder";
|
||||
public static final String INJECT = "inject"; // boolean
|
||||
private static final String PACKAGE_CONTAINIG_ENCODERS = "org.apache.commons.codec.language.";
|
||||
|
||||
private static final Map<String, Class<? extends Encoder>> registry;
|
||||
static {
|
||||
registry = new HashMap<String, Class<? extends Encoder>>();
|
||||
registry.put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class );
|
||||
registry.put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class );
|
||||
registry.put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class );
|
||||
registry.put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class );
|
||||
registry.put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone.class );
|
||||
}
|
||||
private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>()
|
||||
{{
|
||||
put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class );
|
||||
put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class );
|
||||
put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class );
|
||||
put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class );
|
||||
put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone.class );
|
||||
put( "ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class );
|
||||
}};
|
||||
private static final Lock lock = new ReentrantLock();
|
||||
|
||||
protected boolean inject = true;
|
||||
protected String name = null;
|
||||
|
@ -87,7 +92,12 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
|
|||
}
|
||||
Class<? extends Encoder> clazz = registry.get(name.toUpperCase(Locale.ENGLISH));
|
||||
if( clazz == null ) {
|
||||
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Unknown encoder: "+name +" ["+registry.keySet()+"]" );
|
||||
lock.lock();
|
||||
try {
|
||||
clazz = resolveEncoder(name);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
|
@ -105,6 +115,30 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
|
|||
}
|
||||
}
|
||||
|
||||
private Class<? extends Encoder> resolveEncoder(String name) {
|
||||
Class<? extends Encoder> clazz = null;
|
||||
try {
|
||||
clazz = lookupEncoder(PACKAGE_CONTAINIG_ENCODERS+name);
|
||||
} catch (ClassNotFoundException e) {
|
||||
try {
|
||||
clazz = lookupEncoder(name);
|
||||
} catch (ClassNotFoundException cnfe) {
|
||||
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Unknown encoder: "+name +" ["+registry.keySet()+"]" );
|
||||
}
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Not an encoder: "+name +" ["+registry.keySet()+"]" );
|
||||
}
|
||||
return clazz;
|
||||
}
|
||||
|
||||
private Class<? extends Encoder> lookupEncoder(String name)
|
||||
throws ClassNotFoundException {
|
||||
Class<? extends Encoder> clazz = Class.forName(name).asSubclass(Encoder.class);
|
||||
registry.put( name.toUpperCase(Locale.ENGLISH), clazz );
|
||||
return clazz;
|
||||
}
|
||||
|
||||
public PhoneticFilter create(TokenStream input) {
|
||||
return new PhoneticFilter(input,encoder,inject);
|
||||
}
|
||||
|
|
|
@ -32,10 +32,34 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
*/
|
||||
public class TestPhoneticFilterFactory extends BaseTokenTestCase {
|
||||
|
||||
private static final int REPEATS = 100000;
|
||||
|
||||
/**
|
||||
* Case: default
|
||||
*/
|
||||
public void testFactory()
|
||||
{
|
||||
Map<String,String> args = new HashMap<String, String>();
|
||||
|
||||
PhoneticFilterFactory ff = new PhoneticFilterFactory();
|
||||
|
||||
args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
|
||||
ff.init( args );
|
||||
assertTrue( ff.encoder instanceof Metaphone );
|
||||
assertTrue( ff.inject ); // default
|
||||
|
||||
args.put( PhoneticFilterFactory.INJECT, "false" );
|
||||
ff.init( args );
|
||||
assertFalse( ff.inject );
|
||||
}
|
||||
|
||||
/**
|
||||
* Case: Failures and Exceptions
|
||||
*/
|
||||
public void testFactoryCaseFailure()
|
||||
{
|
||||
Map<String,String> args = new HashMap<String, String>();
|
||||
|
||||
PhoneticFilterFactory ff = new PhoneticFilterFactory();
|
||||
try {
|
||||
ff.init( args );
|
||||
|
@ -48,15 +72,27 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
|
|||
fail( "unknown encoder parameter" );
|
||||
}
|
||||
catch( Exception ex ) {}
|
||||
args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.NonExistence" );
|
||||
try {
|
||||
ff.init( args );
|
||||
fail( "unknown encoder parameter" );
|
||||
}
|
||||
catch( Exception ex ) {}
|
||||
}
|
||||
|
||||
/**
|
||||
* Case: Reflection
|
||||
*/
|
||||
public void testFactoryCaseReflection()
|
||||
{
|
||||
Map<String,String> args = new HashMap<String, String>();
|
||||
|
||||
args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
|
||||
PhoneticFilterFactory ff = new PhoneticFilterFactory();
|
||||
|
||||
args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" );
|
||||
ff.init( args );
|
||||
assertTrue( ff.encoder instanceof Metaphone );
|
||||
assertTrue( ff.inject ); // default
|
||||
|
||||
args.put( PhoneticFilterFactory.INJECT, "false" );
|
||||
ff.init( args );
|
||||
assertFalse( ff.inject );
|
||||
}
|
||||
|
||||
public void testAlgorithms() throws Exception {
|
||||
|
@ -85,6 +121,12 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
|
|||
"TTA1111111", "Datha", "KLN1111111", "Carlene" });
|
||||
assertAlgorithm("Caverphone", "false", "Darda Karleen Datha Carlene",
|
||||
new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });
|
||||
|
||||
assertAlgorithm("ColognePhonetic", "true", "Meier Schmitt Meir Schmidt",
|
||||
new String[] { "67", "Meier", "862", "Schmitt",
|
||||
"67", "Meir", "862", "Schmidt" });
|
||||
assertAlgorithm("ColognePhonetic", "false", "Meier Schmitt Meir Schmidt",
|
||||
new String[] { "67", "862", "67", "862" });
|
||||
}
|
||||
|
||||
static void assertAlgorithm(String algName, String inject, String input,
|
||||
|
@ -98,4 +140,25 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
|
|||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, expected);
|
||||
}
|
||||
|
||||
public void testSpeed() throws Exception {
|
||||
checkSpeedEncoding("Metaphone", "easgasg", "ESKS");
|
||||
checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS");
|
||||
checkSpeedEncoding("Soundex", "easgasg", "E220");
|
||||
checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034");
|
||||
checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111");
|
||||
checkSpeedEncoding("ColognePhonetic", "Schmitt", "862");
|
||||
}
|
||||
|
||||
private void checkSpeedEncoding(String encoder, String toBeEncoded, String estimated) throws Exception {
|
||||
long start = System.currentTimeMillis();
|
||||
for ( int i=0; i<REPEATS; i++) {
|
||||
assertAlgorithm(encoder, "false", toBeEncoded,
|
||||
new String[] { estimated });
|
||||
}
|
||||
long duration = System.currentTimeMillis()-start;
|
||||
if (VERBOSE)
|
||||
System.out.println(encoder + " encodings per msec: "+(REPEATS/duration));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
|
||||
Apache SVN contains full history.
|
Loading…
Reference in New Issue