SOLR-224: Adding a PhoneticFilterFactory that uses commons codec language encoders to build phonetically similar tokens.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@537014 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2007-05-10 22:10:05 +00:00
parent 4461557efe
commit f4986af881
5 changed files with 291 additions and 0 deletions

View File

@ -184,6 +184,10 @@ New Features
30. SOLR-226: Added support for dynamic field as the destination of a
copyField using glob (*) replacement. (ryan)
31. SOLR-224: Adding a PhoneticFilterFactory that uses apache commons codec
language encoders to build phonetically similar tokens. This currently
supports: DoubleMetaphone, Metaphone, Soundex, and RefinedSoundex (ryan)
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).

View File

@ -0,0 +1,2 @@
AnyObjectId[957b6752af9a60c1bb2a4f65db0e90e5ce00f521] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,76 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
/**
 * Create tokens for phonetic matches. See:
 * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
 *
 * <p>Every token read from the wrapped stream is run through the configured
 * {@link Encoder}. When <code>inject</code> is true the original token is
 * returned unchanged and the encoded token is returned by the following call
 * with a position increment of 0. When <code>inject</code> is false the
 * encoded token is returned instead of the original one.
 *
 * @version $Id:$
 */
public class PhoneticFilter extends TokenFilter
{
  /** true: emit the encoded token in addition to the original; false: replace it. */
  protected boolean inject = true;

  /** Encoder applied to the text of every incoming token. */
  protected Encoder encoder = null;

  /** Passed as the fourth argument (the token type) of every encoded Token. */
  protected String name = null;

  /** Encoded token held back until the next call to {@link #next()} (inject mode only). */
  protected Token save = null;

  /**
   * @param in      stream supplying the tokens to encode
   * @param encoder encoder applied to each token's text
   * @param name    value handed to the Token constructor for every encoded token
   * @param inject  true to add the encoded token after the original,
   *                false to replace the original with it
   */
  public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
    super(in);
    this.encoder = encoder;
    this.name = name;
    this.inject = inject;
  }

  /**
   * Returns the next token of the filtered stream.
   *
   * @return the next token, or null once the wrapped stream is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public final Token next() throws IOException {
    // An encoded token was queued by the previous call: hand it out first.
    if( save != null ) {
      Token pending = save;
      save = null;
      return pending;
    }

    Token t = input.next();
    if( t == null ) {
      return null;
    }

    String value = t.termText();
    try {
      value = encoder.encode(value).toString();
    }
    catch (Exception ignored) {} // just use the direct text

    Token m = new Token(value, t.startOffset(), t.endOffset(), name );
    if( inject ) {
      // Stack the encoded token on the same position as the original.
      m.setPositionIncrement(0);
      save = m;
      return t;
    }

    // Replace the token rather than add it to the stream. The replacement
    // must occupy the same position as the original, so carry over its
    // position increment instead of silently resetting it to the default.
    m.setPositionIncrement(t.getPositionIncrement());
    return m;
  }
}

View File

@ -0,0 +1,91 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.core.SolrException;
/**
* Create tokens based on phonetic encoders
*
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
*
* This takes two arguments:
* "encoder" required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex"
*
* "inject" (default=true) add tokens to the stream with the offset=0
*
* @version $Id:$
* @see PhoneticFilter
*/
public class PhoneticFilterFactory extends BaseTokenFilterFactory
{
public static final String ENCODER = "encoder";
public static final String INJECT = "inject"; // boolean
private static final Map<String, Class<? extends Encoder>> registry;
static {
registry = new HashMap<String, Class<? extends Encoder>>();
registry.put( "DoubleMetaphone".toUpperCase(), DoubleMetaphone.class );
registry.put( "Metaphone".toUpperCase(), Metaphone.class );
registry.put( "Soundex".toUpperCase(), Soundex.class );
registry.put( "RefinedSoundex".toUpperCase(), RefinedSoundex.class );
}
protected boolean inject = true;
protected String name = null;
protected Encoder encoder = null;
@Override
public void init(Map<String,String> args) {
super.init( args );
if( args.get( "inject" ) != null ) {
inject = Boolean.getBoolean( args.get( INJECT ) );
}
String name = args.get( ENCODER );
if( name == null ) {
throw new SolrException( 500, "Missing required parameter: "+ENCODER
+" ["+registry.keySet()+"]" );
}
Class<? extends Encoder> clazz = registry.get(name.toUpperCase());
if( clazz == null ) {
throw new SolrException( 500, "Unknown encoder: "+name +" ["+registry.keySet()+"]" );
}
try {
encoder = clazz.newInstance();
}
catch (Exception e) {
throw new SolrException( 500, "Error initializing: "+name + "/"+clazz, e );
}
}
public TokenStream create(TokenStream input) {
return new PhoneticFilter(input,encoder,name,inject);
}
}

View File

@ -0,0 +1,118 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Tests for {@link PhoneticFilterFactory} argument handling and for the token
 * sequence produced by {@link PhoneticFilter}.
 *
 * @version $Id:$
 */
public class TestPhoneticFilter extends TestCase {

  /** Checks required/unknown encoder handling and the inject argument. */
  public void testFactory()
  {
    PhoneticFilterFactory factory = new PhoneticFilterFactory();
    Map<String,String> params = new HashMap<String, String>();

    // No encoder configured at all.
    expectInitFailure( factory, params, "missing encoder parameter" );

    // An encoder name that is not registered.
    params.put( PhoneticFilterFactory.ENCODER, "XXX" );
    expectInitFailure( factory, params, "unknown encoder parameter" );

    params.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
    factory.init( params );
    assertTrue( factory.encoder instanceof Metaphone );
    assertTrue( factory.inject ); // default

    params.put( PhoneticFilterFactory.INJECT, "false" );
    factory.init( params );
    assertFalse( factory.inject );
  }

  /** Fails with the given message unless init() throws an Exception. */
  private static void expectInitFailure(
      PhoneticFilterFactory factory, Map<String,String> params, String message )
  {
    try {
      factory.init( params );
      fail( message );
    }
    catch( Exception expected ) {}
  }

  /**
   * Feeds a fixed list of words through a PhoneticFilter and compares the
   * term text of each produced token with the expected sequence: the encoded
   * form of every word, preceded by the word itself when inject is true.
   */
  public void runner( Encoder enc, boolean inject ) throws Exception
  {
    String[] words = { "aaa", "bbb", "ccc", "easgasg" };

    ArrayList<Token> source = new ArrayList<Token>();
    ArrayList<Token> expected = new ArrayList<Token>();
    for( int i = 0; i < words.length; i++ ) {
      String word = words[i];
      int end = word.length();
      source.add( new Token( word, 0, end ) );
      if( inject ) {
        expected.add( new Token( word, 0, end ) );
      }
      expected.add( new Token( enc.encode(word).toString(), 0, end ) );
    }

    IterTokenStream tokens = new IterTokenStream(source.iterator());
    PhoneticFilter filter = new PhoneticFilter( tokens, enc, "text", inject );

    Iterator<Token> wanted = expected.iterator();
    while( wanted.hasNext() ) {
      Token want = wanted.next();
      Token got = filter.next();
      assertEquals( want.termText(), got.termText());
    }
    assertNull( filter.next() ); // no more tokens
  }

  /** Runs every supported encoder, first injecting and then replacing. */
  public void testEncodes() throws Exception {
    for( boolean inject : new boolean[] { true, false } ) {
      runner( new DoubleMetaphone(), inject );
      runner( new Metaphone(), inject );
      runner( new Soundex(), inject );
      runner( new RefinedSoundex(), inject );
    }
  }

  /** TokenStream that hands out the tokens of an iterator, then null. */
  public static class IterTokenStream extends TokenStream {
    Iterator<Token> toks;

    public IterTokenStream(Iterator<Token> toks) {
      this.toks = toks;
    }

    public Token next() {
      return toks.hasNext() ? toks.next() : null;
    }
  }
}