mirror of https://github.com/apache/lucene.git
SOLR-3424 PhoneticFilterFactory thread-safety bug. Improved documentation & tests too.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1334544 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
73dd9ff015
commit
9707be81c1
|
@ -26,8 +26,10 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
|||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Create tokens for phonetic matches. See:
|
||||
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||
* Create tokens for phonetic matches.
|
||||
* @see <a href="
|
||||
* http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||
* ">Apache Commons Codec</a>
|
||||
*/
|
||||
public final class PhoneticFilter extends TokenFilter
|
||||
{
|
||||
|
|
|
@ -18,11 +18,10 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
import org.apache.commons.codec.Encoder;
|
||||
import org.apache.commons.codec.language.*;
|
||||
|
@ -32,14 +31,19 @@ import org.apache.lucene.analysis.phonetic.PhoneticFilter;
|
|||
/**
|
||||
* Factory for {@link PhoneticFilter}.
|
||||
*
|
||||
* Create tokens based on phonetic encoders
|
||||
*
|
||||
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||
*
|
||||
* This takes two arguments:
|
||||
* "encoder" required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex"
|
||||
*
|
||||
* "inject" (default=true) add tokens to the stream with the offset=0
|
||||
* Create tokens based on phonetic encoders from <a href="
|
||||
* http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||
* ">Apache Commons Codec</a>.
|
||||
* <p>
|
||||
* This takes one required argument, "encoder", and the rest are optional:
|
||||
* <dl>
|
||||
* <dt>encoder<dd> required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0),
|
||||
* or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by
|
||||
* itself if it already contains a '.' or otherwise as in the same package as these others.
|
||||
* <dt>inject<dd> (default=true) add tokens to the stream with the offset=0
|
||||
* <dt>maxCodeLength<dd>The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't
|
||||
* support this then specifying this is an error.
|
||||
* </dl>
|
||||
*
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100">
|
||||
|
@ -49,29 +53,32 @@ import org.apache.lucene.analysis.phonetic.PhoneticFilter;
|
|||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
*
|
||||
* @see PhoneticFilter
|
||||
*/
|
||||
public class PhoneticFilterFactory extends BaseTokenFilterFactory
|
||||
{
|
||||
public static final String ENCODER = "encoder";
|
||||
public static final String INJECT = "inject"; // boolean
|
||||
public static final String MAX_CODE_LENGTH = "maxCodeLength";
|
||||
private static final String PACKAGE_CONTAINING_ENCODERS = "org.apache.commons.codec.language.";
|
||||
|
||||
private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>()
|
||||
{{
|
||||
put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class );
|
||||
put( "Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class );
|
||||
put( "Soundex".toUpperCase(Locale.ENGLISH), Soundex.class );
|
||||
put( "RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class );
|
||||
put( "Caverphone".toUpperCase(Locale.ENGLISH), Caverphone2.class );
|
||||
put( "ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class );
|
||||
}};
|
||||
private static final Lock lock = new ReentrantLock();
|
||||
//Effectively constants; uppercase keys
|
||||
private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>(6);
|
||||
|
||||
static {
|
||||
registry.put("DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class);
|
||||
registry.put("Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class);
|
||||
registry.put("Soundex".toUpperCase(Locale.ENGLISH), Soundex.class);
|
||||
registry.put("RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class);
|
||||
registry.put("Caverphone".toUpperCase(Locale.ENGLISH), Caverphone2.class);
|
||||
registry.put("ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class);
|
||||
}
|
||||
|
||||
protected boolean inject = true;
|
||||
protected String name = null;
|
||||
protected Encoder encoder = null;
|
||||
protected Class<? extends Encoder> clazz = null;
|
||||
protected Method setMaxCodeLenMethod = null;
|
||||
protected Integer maxCodeLength = null;
|
||||
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
|
@ -84,56 +91,57 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
|
|||
throw new InitializationException("Missing required parameter: " + ENCODER
|
||||
+ " [" + registry.keySet() + "]");
|
||||
}
|
||||
Class<? extends Encoder> clazz = registry.get(name.toUpperCase(Locale.ENGLISH));
|
||||
clazz = registry.get(name.toUpperCase(Locale.ENGLISH));
|
||||
if( clazz == null ) {
|
||||
lock.lock();
|
||||
try {
|
||||
clazz = resolveEncoder(name);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
String v = args.get(MAX_CODE_LENGTH);
|
||||
if (v != null) {
|
||||
maxCodeLength = Integer.valueOf(v);
|
||||
try {
|
||||
encoder = clazz.newInstance();
|
||||
setMaxCodeLenMethod = clazz.getMethod("setMaxCodeLen", int.class);
|
||||
} catch (Exception e) {
|
||||
throw new InitializationException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH, e);
|
||||
}
|
||||
}
|
||||
|
||||
// Try to set the maxCodeLength
|
||||
String v = args.get( "maxCodeLength" );
|
||||
if( v != null ) {
|
||||
Method setter = encoder.getClass().getMethod( "setMaxCodeLen", int.class );
|
||||
setter.invoke( encoder, Integer.parseInt( v ) );
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new InitializationException("Error initializing: " + name + "/" + clazz, e);
|
||||
}
|
||||
getEncoder();//trigger initialization for potential problems to be thrown now
|
||||
}
|
||||
|
||||
private Class<? extends Encoder> resolveEncoder(String name) {
|
||||
Class<? extends Encoder> clazz = null;
|
||||
String lookupName = name;
|
||||
if (name.indexOf('.') == -1) {
|
||||
lookupName = PACKAGE_CONTAINING_ENCODERS + name;
|
||||
}
|
||||
try {
|
||||
clazz = lookupEncoder(PACKAGE_CONTAINING_ENCODERS+name);
|
||||
} catch (ClassNotFoundException e) {
|
||||
try {
|
||||
clazz = lookupEncoder(name);
|
||||
return Class.forName(lookupName).asSubclass(Encoder.class);
|
||||
} catch (ClassNotFoundException cnfe) {
|
||||
throw new InitializationException("Unknown encoder: " + name + " [" + registry.keySet() + "]");
|
||||
throw new InitializationException("Unknown encoder: " + name + " must be full class name or one of " + registry.keySet(), cnfe);
|
||||
} catch (ClassCastException e) {
|
||||
throw new InitializationException("Not an encoder: " + name + " must be full class name or one of " + registry.keySet(), e);
|
||||
}
|
||||
}
|
||||
catch (ClassCastException e) {
|
||||
throw new InitializationException("Not an encoder: " + name + " [" + registry.keySet() + "]");
|
||||
}
|
||||
return clazz;
|
||||
}
|
||||
|
||||
private Class<? extends Encoder> lookupEncoder(String name)
|
||||
throws ClassNotFoundException {
|
||||
Class<? extends Encoder> clazz = Class.forName(name).asSubclass(Encoder.class);
|
||||
registry.put( name.toUpperCase(Locale.ENGLISH), clazz );
|
||||
return clazz;
|
||||
/** Must be thread-safe. */
|
||||
protected Encoder getEncoder() {
|
||||
// Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate
|
||||
// every time. A simple benchmark showed this as negligible.
|
||||
try {
|
||||
Encoder encoder = clazz.newInstance();
|
||||
// Try to set the maxCodeLength
|
||||
if(maxCodeLength != null && setMaxCodeLenMethod != null) {
|
||||
setMaxCodeLenMethod.invoke(encoder, maxCodeLength);
|
||||
}
|
||||
return encoder;
|
||||
} catch (Exception e) {
|
||||
final Throwable t = (e instanceof InvocationTargetException) ? e.getCause() : e;
|
||||
throw new InitializationException("Error initializing encoder: " + name + " / " + clazz, t);
|
||||
}
|
||||
}
|
||||
|
||||
public PhoneticFilter create(TokenStream input) {
|
||||
return new PhoneticFilter(input,encoder,inject);
|
||||
return new PhoneticFilter(input, getEncoder(), inject);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
import org.apache.commons.codec.language.Caverphone2;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -45,12 +46,16 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
|
|||
|
||||
args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
|
||||
ff.init( args );
|
||||
assertTrue( ff.encoder instanceof Metaphone );
|
||||
assertTrue( ff.getEncoder() instanceof Metaphone );
|
||||
assertTrue( ff.inject ); // default
|
||||
|
||||
args.put( PhoneticFilterFactory.INJECT, "false" );
|
||||
ff.init( args );
|
||||
assertFalse( ff.inject );
|
||||
|
||||
args.put( PhoneticFilterFactory.MAX_CODE_LENGTH, "2");
|
||||
ff.init( args );
|
||||
assertEquals(2,((Metaphone) ff.getEncoder()).getMaxCodeLen());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -91,7 +96,20 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
|
|||
|
||||
args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" );
|
||||
ff.init( args );
|
||||
assertTrue( ff.encoder instanceof Metaphone );
|
||||
assertTrue( ff.getEncoder() instanceof Metaphone );
|
||||
assertTrue( ff.inject ); // default
|
||||
|
||||
// we use "Caverphone2" as it is registered in the REGISTRY as Caverphone,
|
||||
// so this effectively tests reflection without package name
|
||||
args.put( PhoneticFilterFactory.ENCODER, "Caverphone2" );
|
||||
ff.init( args );
|
||||
assertTrue( ff.getEncoder() instanceof Caverphone2 );
|
||||
assertTrue( ff.inject ); // default
|
||||
|
||||
// cross check with registry
|
||||
args.put( PhoneticFilterFactory.ENCODER, "Caverphone" );
|
||||
ff.init( args );
|
||||
assertTrue( ff.getEncoder() instanceof Caverphone2 );
|
||||
assertTrue( ff.inject ); // default
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue