SOLR-224: Adding a PhoneticFilterFactory that uses commons codec language encoders to build phonetically similar tokens.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@537014 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2007-05-10 22:10:05 +00:00
parent 4461557efe
commit f4986af881
5 changed files with 291 additions and 0 deletions

View File

@ -184,6 +184,10 @@ New Features
30. SOLR-226: Added support for dynamic field as the destination of a
copyField using glob (*) replacement. (ryan)
31. SOLR-224: Adding a PhoneticFilterFactory that uses apache commons codec
language encoders to build phonetically similar tokens. This currently
supports: DoubleMetaphone, Metaphone, Soundex, and RefinedSoundex (ryan)
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).

View File

@ -0,0 +1,2 @@
AnyObjectId[957b6752af9a60c1bb2a4f65db0e90e5ce00f521] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,76 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
/**
 * Token filter that produces a phonetic encoding of every token it reads,
 * using an Apache Commons Codec {@link Encoder}. See:
 * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
 *
 * <p>Two modes are supported:
 * <ul>
 *   <li><b>inject = true</b>: the original token is returned unchanged and the
 *       encoded token is returned by the following call to {@link #next()},
 *       with a position increment of 0.</li>
 *   <li><b>inject = false</b>: the encoded token is returned in place of the
 *       original one and keeps the original's position increment.</li>
 * </ul>
 *
 * In both modes the encoded token reuses the start/end offsets of the original
 * token and is created with {@code name} as its token type.
 *
 * @version $Id:$
 */
public class PhoneticFilter extends TokenFilter
{
  /** When true the encoded token is added next to the original; when false it replaces it. */
  protected boolean inject = true;
  /** Encoder used to compute the phonetic form of each term. */
  protected Encoder encoder = null;
  /** Token type given to every encoded token. */
  protected String name = null;
  /** Encoded token waiting to be returned by the next call to {@link #next()} (inject mode only). */
  protected Token save = null;

  /**
   * @param in      the upstream token stream
   * @param encoder encoder applied to each token's term text
   * @param name    token type assigned to the encoded tokens
   * @param inject  true to emit the encoded token in addition to the original,
   *                false to emit it instead of the original
   */
  public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
    super(in);
    this.encoder = encoder;
    this.name = name;
    this.inject = inject;
  }

  /**
   * Returns the next token of the filtered stream.
   *
   * @return the pending encoded token if one was saved by the previous call,
   *         otherwise the next original or encoded token (depending on
   *         {@code inject}), or null when the upstream is exhausted
   * @throws IOException if reading from the upstream token stream fails
   */
  @Override
  public final Token next() throws IOException {
    // In inject mode the encoded twin of the previously returned token is
    // handed out before anything else is read from the upstream.
    if( save != null ) {
      Token pending = save;
      save = null;
      return pending;
    }

    Token t = input.next();
    if( t == null ) {
      return null;
    }

    String value = t.termText();
    try {
      value = encoder.encode(t.termText()).toString();
    }
    catch (Exception ignored) {} // best effort: fall back to the unencoded text

    Token m = new Token(value, t.startOffset(), t.endOffset(), name );
    if( inject ) {
      // Position increment 0 stacks the encoded token on the original's position.
      m.setPositionIncrement(0);
      save = m;
      return t;
    }

    // Replace the token rather than add it to the stream. The replacement must
    // carry the original's position increment, otherwise position gaps set by
    // upstream filters are lost.
    m.setPositionIncrement(t.getPositionIncrement());
    return m;
  }
}

View File

@ -0,0 +1,91 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.core.SolrException;
/**
* Create tokens based on phonetic encoders
*
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
*
* This takes two arguments:
* "encoder" required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex"
*
* "inject" (default=true) add tokens to the stream with the offset=0
*
* @version $Id:$
* @see PhoneticFilter
*/
public class PhoneticFilterFactory extends BaseTokenFilterFactory
{
public static final String ENCODER = "encoder";
public static final String INJECT = "inject"; // boolean
private static final Map<String, Class<? extends Encoder>> registry;
static {
registry = new HashMap<String, Class<? extends Encoder>>();
registry.put( "DoubleMetaphone".toUpperCase(), DoubleMetaphone.class );
registry.put( "Metaphone".toUpperCase(), Metaphone.class );
registry.put( "Soundex".toUpperCase(), Soundex.class );
registry.put( "RefinedSoundex".toUpperCase(), RefinedSoundex.class );
}
protected boolean inject = true;
protected String name = null;
protected Encoder encoder = null;
@Override
public void init(Map<String,String> args) {
super.init( args );
if( args.get( "inject" ) != null ) {
inject = Boolean.getBoolean( args.get( INJECT ) );
}
String name = args.get( ENCODER );
if( name == null ) {
throw new SolrException( 500, "Missing required parameter: "+ENCODER
+" ["+registry.keySet()+"]" );
}
Class<? extends Encoder> clazz = registry.get(name.toUpperCase());
if( clazz == null ) {
throw new SolrException( 500, "Unknown encoder: "+name +" ["+registry.keySet()+"]" );
}
try {
encoder = clazz.newInstance();
}
catch (Exception e) {
throw new SolrException( 500, "Error initializing: "+name + "/"+clazz, e );
}
}
public TokenStream create(TokenStream input) {
return new PhoneticFilter(input,encoder,name,inject);
}
}

View File

@ -0,0 +1,118 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Tests the argument handling of {@link PhoneticFilterFactory} and the term
 * text produced by {@link PhoneticFilter} for each supported encoder.
 *
 * @version $Id:$
 */
public class TestPhoneticFilter extends TestCase {

  /**
   * The factory must reject a missing or unknown encoder, accept a known one,
   * default "inject" to true and honour an explicit "false".
   */
  public void testFactory()
  {
    PhoneticFilterFactory factory = new PhoneticFilterFactory();
    Map<String,String> params = new HashMap<String, String>();

    // No encoder argument at all.
    try {
      factory.init( params );
      fail( "missing encoder parameter" );
    }
    catch( Exception ex ) {}

    // An encoder name that is not registered.
    params.put( PhoneticFilterFactory.ENCODER, "XXX" );
    try {
      factory.init( params );
      fail( "unknown encoder parameter" );
    }
    catch( Exception ex ) {}

    // A valid encoder, with "inject" left unset.
    params.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
    factory.init( params );
    assertTrue( factory.encoder instanceof Metaphone );
    assertTrue( factory.inject ); // default

    // Explicitly switching injection off.
    params.put( PhoneticFilterFactory.INJECT, "false" );
    factory.init( params );
    assertFalse( factory.inject );
  }

  /**
   * Feeds a fixed list of words through a {@link PhoneticFilter} and checks the
   * term text of every token that comes out.
   *
   * @param enc    encoder handed to the filter and used to compute the expected text
   * @param inject whether the original tokens are expected in the output as well
   */
  public void runner( Encoder enc, boolean inject ) throws Exception
  {
    String[] words = { "aaa", "bbb", "ccc", "easgasg" };

    ArrayList<Token> source = new ArrayList<Token>();
    ArrayList<Token> expected = new ArrayList<Token>();
    for( int i = 0; i < words.length; i++ ) {
      String word = words[i];
      int end = word.length();

      source.add( new Token( word, 0, end ) );
      if( inject ) {
        expected.add( new Token( word, 0, end ) );
      }
      String encoded = enc.encode(word).toString();
      expected.add( new Token( encoded, 0, end ) );
    }

    TokenStream upstream = new IterTokenStream(source.iterator());
    PhoneticFilter filter = new PhoneticFilter( upstream, enc, "text", inject );

    Iterator<Token> want = expected.iterator();
    while( want.hasNext() ) {
      Token actual = filter.next();
      assertEquals( want.next().termText(), actual.termText());
    }
    assertNull( filter.next() ); // no more tokens
  }

  /** Runs {@link #runner} for every encoder, first injecting and then replacing. */
  public void testEncodes() throws Exception {
    boolean[] modes = { true, false };
    for( int i = 0; i < modes.length; i++ ) {
      boolean inject = modes[i];
      runner( new DoubleMetaphone(), inject );
      runner( new Metaphone(), inject );
      runner( new Soundex(), inject );
      runner( new RefinedSoundex(), inject );
    }
  }

  /** Minimal {@link TokenStream} that hands out the tokens of an iterator one by one. */
  public static class IterTokenStream extends TokenStream {
    Iterator<Token> toks;

    public IterTokenStream(Iterator<Token> toks) {
      this.toks = toks;
    }

    /** @return the next token of the iterator, or null once it is exhausted */
    public Token next() {
      return toks.hasNext() ? toks.next() : null;
    }
  }
}