SOLR-3424 PhoneticFilterFactory thread-safety bug. Improved documentation & tests too.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1334544 13f79535-47bb-0310-9956-ffa450edef68
2012-05-06 02:23:38 +00:00 · 2012-05-06 02:23:38 +00:00 · 9707be81c1
parent 73dd9ff015
commit 9707be81c1
3 changed files with 95 additions and 67 deletions
--- a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java
+++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java
@ -26,8 +26,10 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import java.io.IOException;

 /**
- * Create tokens for phonetic matches.  See:
- * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
+ * Create tokens for phonetic matches.
+ * @see <a href="
+ * http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
+ * ">Apache Commons Codec</a>
 */
 public final class PhoneticFilter extends TokenFilter 
 {
--- a/solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java
@ -18,11 +18,10 @@
 package org.apache.solr.analysis;

 import java.lang.reflect.Method;
+import java.lang.reflect.InvocationTargetException;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;

 import org.apache.commons.codec.Encoder;
 import org.apache.commons.codec.language.*;
@ -32,14 +31,19 @@ import org.apache.lucene.analysis.phonetic.PhoneticFilter;
 /**
 * Factory for {@link PhoneticFilter}.
 * 
- * Create tokens based on phonetic encoders
- * 
- * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
- * 
- * This takes two arguments:
- *  "encoder" required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex"
- * 
- * "inject" (default=true) add tokens to the stream with the offset=0
+ * Create tokens based on phonetic encoders from <a href="
+ * http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
+ * ">Apache Commons Codec</a>.
+ * <p>
+ * This takes one required argument, "encoder", and the rest are optional:
+ * <dl>
+ *  <dt>encoder<dd> required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0),
+ *  or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by
+ *  itself if it already contains a '.' or otherwise as in the same package as these others.
+ *  <dt>inject<dd> (default=true) add tokens to the stream with the offset=0
+ *  <dt>maxCodeLength<dd>The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't
+ *  support this then specifying this is an error.
+ * </dl>
 *
 * <pre class="prettyprint" >
 * &lt;fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100"&gt;
@ -49,29 +53,32 @@ import org.apache.lucene.analysis.phonetic.PhoneticFilter;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 * 
- *
 * @see PhoneticFilter
 */
 public class PhoneticFilterFactory extends BaseTokenFilterFactory 
 {
  public static final String ENCODER = "encoder";
  public static final String INJECT = "inject"; // boolean
+  public static final String MAX_CODE_LENGTH = "maxCodeLength";
  private static final String PACKAGE_CONTAINING_ENCODERS = "org.apache.commons.codec.language.";

-  private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>()
-  {{
-    put( "DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class );
-    put( "Metaphone".toUpperCase(Locale.ENGLISH),       Metaphone.class );
-    put( "Soundex".toUpperCase(Locale.ENGLISH),         Soundex.class );
-    put( "RefinedSoundex".toUpperCase(Locale.ENGLISH),  RefinedSoundex.class );
-    put( "Caverphone".toUpperCase(Locale.ENGLISH),      Caverphone2.class );
-    put( "ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class );
-  }};
-  private static final Lock lock = new ReentrantLock();
+  //Effectively constants; uppercase keys
+  private static final Map<String, Class<? extends Encoder>> registry = new HashMap<String, Class<? extends Encoder>>(6);
+
+  static {
+    registry.put("DoubleMetaphone".toUpperCase(Locale.ENGLISH), DoubleMetaphone.class);
+    registry.put("Metaphone".toUpperCase(Locale.ENGLISH), Metaphone.class);
+    registry.put("Soundex".toUpperCase(Locale.ENGLISH), Soundex.class);
+    registry.put("RefinedSoundex".toUpperCase(Locale.ENGLISH), RefinedSoundex.class);
+    registry.put("Caverphone".toUpperCase(Locale.ENGLISH), Caverphone2.class);
+    registry.put("ColognePhonetic".toUpperCase(Locale.ENGLISH), ColognePhonetic.class);
+  }

  protected boolean inject = true;
  protected String name = null;
-  protected Encoder encoder = null;
+  protected Class<? extends Encoder> clazz = null;
+  protected Method setMaxCodeLenMethod = null;
+  protected Integer maxCodeLength = null;

  @Override
  public void init(Map<String,String> args) {
@ -84,56 +91,57 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
      throw new InitializationException("Missing required parameter: " + ENCODER
          + " [" + registry.keySet() + "]");
    }
-    Class<? extends Encoder> clazz = registry.get(name.toUpperCase(Locale.ENGLISH));
+    clazz = registry.get(name.toUpperCase(Locale.ENGLISH));
    if( clazz == null ) {
-      lock.lock();
-      try {
      clazz = resolveEncoder(name);
-      } finally {
-        lock.unlock();
-      }
    }

+    String v = args.get(MAX_CODE_LENGTH);
+    if (v != null) {
+      maxCodeLength = Integer.valueOf(v);
      try {
-      encoder = clazz.newInstance();
+        setMaxCodeLenMethod = clazz.getMethod("setMaxCodeLen", int.class);
+      } catch (Exception e) {
+        throw new InitializationException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH, e);
+      }
+    }

-      // Try to set the maxCodeLength
-      String v = args.get( "maxCodeLength" );
-      if( v != null ) {
-        Method setter = encoder.getClass().getMethod( "setMaxCodeLen", int.class );
-        setter.invoke( encoder, Integer.parseInt( v ) );
-      }
-    } 
-    catch (Exception e) {
-      throw new InitializationException("Error initializing: " + name + "/" + clazz, e);
-    }
+    getEncoder();//trigger initialization for potential problems to be thrown now
  }

  private Class<? extends Encoder> resolveEncoder(String name) {
-    Class<? extends Encoder> clazz = null;
+    String lookupName = name;
+    if (name.indexOf('.') == -1) {
+      lookupName = PACKAGE_CONTAINING_ENCODERS + name;
+    }
    try {
-      clazz = lookupEncoder(PACKAGE_CONTAINING_ENCODERS+name);
-    } catch (ClassNotFoundException e) {
-      try {
-        clazz = lookupEncoder(name);
+      return Class.forName(lookupName).asSubclass(Encoder.class);
    } catch (ClassNotFoundException cnfe) {
-        throw new InitializationException("Unknown encoder: " + name + " [" + registry.keySet() + "]");
+      throw new InitializationException("Unknown encoder: " + name + " must be full class name or one of " + registry.keySet(), cnfe);
+    } catch (ClassCastException e) {
+      throw new InitializationException("Not an encoder: " + name + " must be full class name or one of " + registry.keySet(), e);
    }
  }
-    catch (ClassCastException e) {
-      throw new InitializationException("Not an encoder: " + name + " [" + registry.keySet() + "]");
-    }
-    return clazz;
-  }

-  private Class<? extends Encoder> lookupEncoder(String name)
-      throws ClassNotFoundException {
-    Class<? extends Encoder> clazz = Class.forName(name).asSubclass(Encoder.class);
-    registry.put( name.toUpperCase(Locale.ENGLISH), clazz );
-    return clazz;
+  /** Must be thread-safe. */
+  protected Encoder getEncoder() {
+    // Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate
+    // every time.  A simple benchmark showed this as negligible.
+    try {
+      Encoder encoder = clazz.newInstance();
+      // Try to set the maxCodeLength
+      if(maxCodeLength != null && setMaxCodeLenMethod != null) {
+        setMaxCodeLenMethod.invoke(encoder, maxCodeLength);
+      }
+      return encoder;
+    } catch (Exception e) {
+      final Throwable t = (e instanceof InvocationTargetException) ? e.getCause() : e;
+      throw new InitializationException("Error initializing encoder: " + name + " / " + clazz, t);
+    }
  }

  public PhoneticFilter create(TokenStream input) {
-    return new PhoneticFilter(input,encoder,inject);
+    return new PhoneticFilter(input, getEncoder(), inject);
  }
+
 }
--- a/solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java
@ -22,6 +22,7 @@ import java.util.HashMap;
 import java.util.Map;

 import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.Caverphone2;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@ -45,12 +46,16 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {
    
    args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
    ff.init( args );
-    assertTrue( ff.encoder instanceof Metaphone );
+    assertTrue( ff.getEncoder() instanceof Metaphone );
    assertTrue( ff.inject ); // default

    args.put( PhoneticFilterFactory.INJECT, "false" );
    ff.init( args );
    assertFalse( ff.inject );
+
+    args.put( PhoneticFilterFactory.MAX_CODE_LENGTH, "2");
+    ff.init( args );
+    assertEquals(2,((Metaphone) ff.getEncoder()).getMaxCodeLen());
  }
  
  /**
@ -91,7 +96,20 @@ public class TestPhoneticFilterFactory extends BaseTokenTestCase {

    args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" );
    ff.init( args );
-    assertTrue( ff.encoder instanceof Metaphone );
+    assertTrue( ff.getEncoder() instanceof Metaphone );
+    assertTrue( ff.inject ); // default
+
+    // we use "Caverphone2" as it is registered in the REGISTRY as Caverphone,
+    // so this effectively tests reflection without package name
+    args.put( PhoneticFilterFactory.ENCODER, "Caverphone2" );
+    ff.init( args );
+    assertTrue( ff.getEncoder() instanceof Caverphone2 );
+    assertTrue( ff.inject ); // default
+    
+    // cross check with registry
+    args.put( PhoneticFilterFactory.ENCODER, "Caverphone" );
+    ff.init( args );
+    assertTrue( ff.getEncoder() instanceof Caverphone2 );
    assertTrue( ff.inject ); // default
  }