diff --git a/CHANGES.txt b/CHANGES.txt index cde4559b84a..a927e7c2903 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -184,6 +184,10 @@ New Features 30. SOLR-226: Added support for dynamic field as the destination of a copyField using glob (*) replacement. (ryan) +31. SOLR-224: Adding a PhoneticFilterFactory that uses apache commons codec + language encoders to build phonetically similar tokens. This currently + supports: DoubleMetaphone, Metaphone, Soundex, and RefinedSoundex (ryan) + Changes in runtime behavior 1. Highlighting using DisMax will only pick up terms from the main user query, not boost or filter queries (klaas). diff --git a/lib/commons-codec-1.3.jar b/lib/commons-codec-1.3.jar new file mode 100644 index 00000000000..41a0921bf9f --- /dev/null +++ b/lib/commons-codec-1.3.jar @@ -0,0 +1,2 @@ +AnyObjectId[957b6752af9a60c1bb2a4f65db0e90e5ce00f521] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/src/java/org/apache/solr/analysis/PhoneticFilter.java b/src/java/org/apache/solr/analysis/PhoneticFilter.java new file mode 100644 index 00000000000..fcbe5fb0a69 --- /dev/null +++ b/src/java/org/apache/solr/analysis/PhoneticFilter.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import org.apache.commons.codec.Encoder; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; + +import java.io.IOException; + +/** + * Create tokens for phonetic matches. See: + * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html + * + * @version $Id:$ + */ +public class PhoneticFilter extends TokenFilter +{ + protected boolean inject = true; + protected Encoder encoder = null; + protected String name = null; + + protected Token save = null; + + public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) { + super(in); + this.encoder = encoder; + this.name = name; + this.inject = inject; + } + + @Override + public final Token next() throws IOException { + if( save != null ) { + Token temp = save; + save = null; + return temp; + } + + Token t = input.next(); + if( t != null ) { + String value = t.termText(); + try { + value = encoder.encode(t.termText()).toString(); + } + catch (Exception ignored) {} // just use the direct text + + Token m = new Token(value, t.startOffset(), t.endOffset(), name ); + if( inject ) { + m.setPositionIncrement(0); + save = m; + } + else { + // replace the token rather then add it too the stream + return m; + } + } + return t; + } +} diff --git a/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java b/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java new file mode 100644 index 00000000000..678e823b6c1 --- /dev/null +++ b/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java @@ -0,0 +1,91 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.codec.Encoder; +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.RefinedSoundex; +import org.apache.commons.codec.language.Soundex; +import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.core.SolrException; + +/** + * Create tokens based on phonetic encoders + * + * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html + * + * This takes two arguments: + * "encoder" required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex" + * + * "inject" (default=true) add tokens to the stream with the offset=0 + * + * @version $Id:$ + * @see PhoneticFilter + */ +public class PhoneticFilterFactory extends BaseTokenFilterFactory +{ + public static final String ENCODER = "encoder"; + public static final String INJECT = "inject"; // boolean + + private static final Map> registry; + static { + registry = new HashMap>(); + registry.put( "DoubleMetaphone".toUpperCase(), DoubleMetaphone.class ); + registry.put( "Metaphone".toUpperCase(), Metaphone.class ); + registry.put( "Soundex".toUpperCase(), Soundex.class ); + registry.put( "RefinedSoundex".toUpperCase(), RefinedSoundex.class ); + } + + protected boolean inject = true; + protected String name = null; + protected Encoder encoder = null; + + @Override + public void init(Map args) { + super.init( args ); + + if( args.get( "inject" ) != null ) { + inject = Boolean.getBoolean( args.get( INJECT ) ); + } + + String name = args.get( ENCODER ); + if( name == null ) { + throw new SolrException( 500, "Missing required parameter: "+ENCODER + +" ["+registry.keySet()+"]" ); + } + Class clazz = registry.get(name.toUpperCase()); + if( clazz == null ) { + throw new SolrException( 500, "Unknown encoder: "+name +" ["+registry.keySet()+"]" ); + } + + try { + encoder = clazz.newInstance(); + } + catch (Exception e) { + throw new SolrException( 500, "Error initializing: "+name + "/"+clazz, e ); + } + } + + public TokenStream create(TokenStream input) { + return new PhoneticFilter(input,encoder,name,inject); + } +} diff --git a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java new file mode 100644 index 00000000000..6d65f6a78f6 --- /dev/null +++ b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.analysis; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.commons.codec.Encoder; +import org.apache.commons.codec.language.DoubleMetaphone; +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.RefinedSoundex; +import org.apache.commons.codec.language.Soundex; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + + +/** + * @version $Id:$ + */ +public class TestPhoneticFilter extends TestCase { + + public void testFactory() + { + Map args = new HashMap(); + + PhoneticFilterFactory ff = new PhoneticFilterFactory(); + try { + ff.init( args ); + fail( "missing encoder parameter" ); + } + catch( Exception ex ) {} + args.put( PhoneticFilterFactory.ENCODER, "XXX" ); + try { + ff.init( args ); + fail( "unknown encoder parameter" ); + } + catch( Exception ex ) {} + + args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); + ff.init( args ); + assertTrue( ff.encoder instanceof Metaphone ); + assertTrue( ff.inject ); // default + + args.put( PhoneticFilterFactory.INJECT, "false" ); + ff.init( args ); + assertFalse( ff.inject ); + } + + public void runner( Encoder enc, boolean inject ) throws Exception + { + String[] input = new String[] { + "aaa", "bbb", "ccc", "easgasg" + }; + + ArrayList stream = new ArrayList(); + ArrayList output = new ArrayList(); + for( String s : input ) { + stream.add( new Token( s, 0, s.length() ) ); + if( inject ) { + output.add( new Token( s, 0, s.length() ) ); + } + output.add( new Token( enc.encode(s).toString(), 0, s.length() ) ); + } + + PhoneticFilter filter = new PhoneticFilter( + new IterTokenStream(stream.iterator()), enc, "text", inject ); + + for( Token t : output ) { + Token got = filter.next(); + assertEquals( t.termText(), got.termText()); + } + assertNull( filter.next() ); // no more tokens + } + + public void testEncodes() throws Exception { + runner( new DoubleMetaphone(), true ); + runner( new Metaphone(), true ); + runner( new Soundex(), true ); + runner( new RefinedSoundex(), true ); + + runner( new DoubleMetaphone(), false ); + runner( new Metaphone(), false ); + runner( new Soundex(), false ); + runner( new RefinedSoundex(), false ); + } + + public static class IterTokenStream extends TokenStream { + Iterator toks; + public IterTokenStream(Iterator toks) { + this.toks = toks; + } + public Token next() { + if (toks.hasNext()) { + return toks.next(); + } + return null; + } + } +}