SOLR-3424 PhoneticFilterFactory thread-safety bug. Improved documentation & tests too.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1334544 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Wayne Smiley 2012-05-06 02:23:38 +00:00
parent 73dd9ff015
commit 9707be81c1
3 changed files with 95 additions and 67 deletions

View File

@ -26,8 +26,10 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException; import java.io.IOException;
/** /**
* Create tokens for phonetic matches. See: * Create tokens for phonetic matches.
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html * @see <a href="
* http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
* ">Apache Commons Codec</a>
*/ */
public final class PhoneticFilter extends TokenFilter public final class PhoneticFilter extends TokenFilter
{ {

View File

@ -18,11 +18,10 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import java.lang.reflect.Method; import java.lang.reflect.Method;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.codec.Encoder; import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.*; import org.apache.commons.codec.language.*;
@ -32,14 +31,19 @@ import org.apache.lucene.analysis.phonetic.PhoneticFilter;
/** /**
* Factory for {@link PhoneticFilter}. * Factory for {@link PhoneticFilter}.
* *
* Create tokens based on phonetic encoders * Create tokens based on phonetic encoders from <a href="
* * http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html * ">Apache Commons Codec</a>.
* * <p>
* This takes two arguments: * This takes one required argument, "encoder", and the rest are optional:
* "encoder" required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex" * <dl>
* * <dt>encoder<dd> required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0),
* "inject" (default=true) add tokens to the stream with the offset=0 * or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by
* itself if it already contains a '.' or otherwise as in the same package as these others.
* <dt>inject<dd> (default=true) add tokens to the stream with the offset=0
* <dt>maxCodeLength<dd>The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't
* support this then specifying this is an error.
* </dl>
* *
* <pre class="prettyprint" > * <pre class="prettyprint" >
* &lt;fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100"&gt; * &lt;fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100"&gt;
@ -49,29 +53,32 @@ import org.apache.lucene.analysis.phonetic.PhoneticFilter;
* &lt;/analyzer&gt; * &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
* *
*
* @see PhoneticFilter * @see PhoneticFilter
*/ */
public class PhoneticFilterFactory extends BaseTokenFilterFactory public class PhoneticFilterFactory extends BaseTokenFilterFactory
{ {
public static final String ENCODER = "encoder"; public static final String ENCODER = "encoder";
public static final String INJECT = "inject"; // boolean public static final String INJECT = "inject"; // boolean
public static final String MAX_CODE_LENGTH = "maxCodeLength";
private static final String PACKAGE_CONTAINING_ENCODERS = "org.apache.commons.codec.language."; private static final String PACKAGE_CONTAINING_ENCODERS = "org.apache.commons.codec.language.";
private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>() //Effectively constants; uppercase keys
{{ private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>(6);
put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class );
put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class ); static {
put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class ); registry.put("DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class);
put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class ); registry.put("Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class);
put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone2.class ); registry.put("Soundex".toUpperCase(Locale.ENGLISH), Soundex.class);
put( "ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class ); registry.put("RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class);
}}; registry.put("Caverphone".toUpperCase(Locale.ENGLISH), Caverphone2.class);
private static final Lock lock = new ReentrantLock(); registry.put("ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class);
}
protected boolean inject = true; protected boolean inject = true;
protected String name = null; protected String name = null;
protected Encoder encoder = null; protected Class<? extends Encoder> clazz = null;
protected Method setMaxCodeLenMethod = null;
protected Integer maxCodeLength = null;
@Override @Override
public void init(Map<String,String> args) { public void init(Map<String,String> args) {
@ -84,56 +91,57 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
throw new InitializationException("Missing required parameter: " + ENCODER throw new InitializationException("Missing required parameter: " + ENCODER
+ " [" + registry.keySet() + "]"); + " [" + registry.keySet() + "]");
} }
Class<? extends Encoder> clazz = registry.get(name.toUpperCase(Locale.ENGLISH)); clazz = registry.get(name.toUpperCase(Locale.ENGLISH));
if( clazz == null ) { if( clazz == null ) {
lock.lock();
try {
clazz = resolveEncoder(name); clazz = resolveEncoder(name);
} finally {
lock.unlock();
}
} }
String v = args.get(MAX_CODE_LENGTH);
if (v != null) {
maxCodeLength = Integer.valueOf(v);
try { try {
encoder = clazz.newInstance(); setMaxCodeLenMethod = clazz.getMethod("setMaxCodeLen", int.class);
} catch (Exception e) {
throw new InitializationException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH, e);
}
}
// Try to set the maxCodeLength getEncoder();//trigger initialization for potential problems to be thrown now
String v = args.get( "maxCodeLength" );
if( v != null ) {
Method setter = encoder.getClass().getMethod( "setMaxCodeLen", int.class );
setter.invoke( encoder, Integer.parseInt( v ) );
}
}
catch (Exception e) {
throw new InitializationException("Error initializing: " + name + "/" + clazz, e);
}
} }
private Class<? extends Encoder> resolveEncoder(String name) { private Class<? extends Encoder> resolveEncoder(String name) {
Class<? extends Encoder> clazz = null; String lookupName = name;
if (name.indexOf('.') == -1) {
lookupName = PACKAGE_CONTAINING_ENCODERS + name;
}
try { try {
clazz = lookupEncoder(PACKAGE_CONTAINING_ENCODERS+name); return Class.forName(lookupName).asSubclass(Encoder.class);
} catch (ClassNotFoundException e) {
try {
clazz = lookupEncoder(name);
} catch (ClassNotFoundException cnfe) { } catch (ClassNotFoundException cnfe) {
throw new InitializationException("Unknown encoder: " + name + " [" + registry.keySet() + "]"); throw new InitializationException("Unknown encoder: " + name + " must be full class name or one of " + registry.keySet(), cnfe);
} catch (ClassCastException e) {
throw new InitializationException("Not an encoder: " + name + " must be full class name or one of " + registry.keySet(), e);
} }
} }
catch (ClassCastException e) {
throw new InitializationException("Not an encoder: " + name + " [" + registry.keySet() + "]");
}
return clazz;
}
private Class<? extends Encoder> lookupEncoder(String name) /** Must be thread-safe. */
throws ClassNotFoundException { protected Encoder getEncoder() {
Class<? extends Encoder> clazz = Class.forName(name).asSubclass(Encoder.class); // Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate
registry.put( name.toUpperCase(Locale.ENGLISH), clazz ); // every time. A simple benchmark showed this as negligible.
return clazz; try {
Encoder encoder = clazz.newInstance();
// Try to set the maxCodeLength
if(maxCodeLength != null && setMaxCodeLenMethod != null) {
setMaxCodeLenMethod.invoke(encoder, maxCodeLength);
}
return encoder;
} catch (Exception e) {
final Throwable t = (e instanceof InvocationTargetException) ? e.getCause() : e;
throw new InitializationException("Error initializing encoder: " + name + " / " + clazz, t);
}
} }
public PhoneticFilter create(TokenStream input) { public PhoneticFilter create(TokenStream input) {
return new PhoneticFilter(input,encoder,inject); return new PhoneticFilter(input, getEncoder(), inject);
} }
} }

View File

@ -22,6 +22,7 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.apache.commons.codec.language.Metaphone; import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.Caverphone2;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
@ -45,12 +46,16 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
ff.init( args ); ff.init( args );
assertTrue( ff.encoder instanceof Metaphone ); assertTrue( ff.getEncoder() instanceof Metaphone );
assertTrue( ff.inject ); // default assertTrue( ff.inject ); // default
args.put( PhoneticFilterFactory.INJECT, "false" ); args.put( PhoneticFilterFactory.INJECT, "false" );
ff.init( args ); ff.init( args );
assertFalse( ff.inject ); assertFalse( ff.inject );
args.put( PhoneticFilterFactory.MAX_CODE_LENGTH, "2");
ff.init( args );
assertEquals(2,((Metaphone) ff.getEncoder()).getMaxCodeLen());
} }
/** /**
@ -91,7 +96,20 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" ); args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" );
ff.init( args ); ff.init( args );
assertTrue( ff.encoder instanceof Metaphone ); assertTrue( ff.getEncoder() instanceof Metaphone );
assertTrue( ff.inject ); // default
// we use "Caverphone2" as it is registered in the REGISTRY as Caverphone,
// so this effectively tests reflection without package name
args.put( PhoneticFilterFactory.ENCODER, "Caverphone2" );
ff.init( args );
assertTrue( ff.getEncoder() instanceof Caverphone2 );
assertTrue( ff.inject ); // default
// cross check with registry
args.put( PhoneticFilterFactory.ENCODER, "Caverphone" );
ff.init( args );
assertTrue( ff.getEncoder() instanceof Caverphone2 );
assertTrue( ff.inject ); // default assertTrue( ff.inject ); // default
} }