mirror of https://github.com/apache/lucene.git
SOLR-224: Adding a PhoneticFilterFactory that uses commons codec language encoders to build phonetically similar tokens.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@537014 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4461557efe
commit
f4986af881
|
@ -184,6 +184,10 @@ New Features
|
|||
30. SOLR-226: Added support for dynamic field as the destination of a
|
||||
copyField using glob (*) replacement. (ryan)
|
||||
|
||||
31. SOLR-224: Adding a PhoneticFilterFactory that uses apache commons codec
|
||||
language encoders to build phonetically similar tokens. This currently
|
||||
supports: DoubleMetaphone, Metaphone, Soundex, and RefinedSoundex (ryan)
|
||||
|
||||
Changes in runtime behavior
|
||||
1. Highlighting using DisMax will only pick up terms from the main
|
||||
user query, not boost or filter queries (klaas).
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[957b6752af9a60c1bb2a4f65db0e90e5ce00f521] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,76 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.commons.codec.Encoder;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Create tokens for phonetic matches. See:
|
||||
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||
*
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class PhoneticFilter extends TokenFilter
|
||||
{
|
||||
protected boolean inject = true;
|
||||
protected Encoder encoder = null;
|
||||
protected String name = null;
|
||||
|
||||
protected Token save = null;
|
||||
|
||||
public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
|
||||
super(in);
|
||||
this.encoder = encoder;
|
||||
this.name = name;
|
||||
this.inject = inject;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Token next() throws IOException {
|
||||
if( save != null ) {
|
||||
Token temp = save;
|
||||
save = null;
|
||||
return temp;
|
||||
}
|
||||
|
||||
Token t = input.next();
|
||||
if( t != null ) {
|
||||
String value = t.termText();
|
||||
try {
|
||||
value = encoder.encode(t.termText()).toString();
|
||||
}
|
||||
catch (Exception ignored) {} // just use the direct text
|
||||
|
||||
Token m = new Token(value, t.startOffset(), t.endOffset(), name );
|
||||
if( inject ) {
|
||||
m.setPositionIncrement(0);
|
||||
save = m;
|
||||
}
|
||||
else {
|
||||
// replace the token rather then add it too the stream
|
||||
return m;
|
||||
}
|
||||
}
|
||||
return t;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.codec.Encoder;
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
import org.apache.commons.codec.language.RefinedSoundex;
|
||||
import org.apache.commons.codec.language.Soundex;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.core.SolrException;
|
||||
|
||||
/**
|
||||
* Create tokens based on phonetic encoders
|
||||
*
|
||||
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
|
||||
*
|
||||
* This takes two arguments:
|
||||
* "encoder" required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex"
|
||||
*
|
||||
* "inject" (default=true) add tokens to the stream with the offset=0
|
||||
*
|
||||
* @version $Id:$
|
||||
* @see PhoneticFilter
|
||||
*/
|
||||
public class PhoneticFilterFactory extends BaseTokenFilterFactory
|
||||
{
|
||||
public static final String ENCODER = "encoder";
|
||||
public static final String INJECT = "inject"; // boolean
|
||||
|
||||
private static final Map<String, Class<? extends Encoder>> registry;
|
||||
static {
|
||||
registry = new HashMap<String, Class<? extends Encoder>>();
|
||||
registry.put( "DoubleMetaphone".toUpperCase(), DoubleMetaphone.class );
|
||||
registry.put( "Metaphone".toUpperCase(), Metaphone.class );
|
||||
registry.put( "Soundex".toUpperCase(), Soundex.class );
|
||||
registry.put( "RefinedSoundex".toUpperCase(), RefinedSoundex.class );
|
||||
}
|
||||
|
||||
protected boolean inject = true;
|
||||
protected String name = null;
|
||||
protected Encoder encoder = null;
|
||||
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init( args );
|
||||
|
||||
if( args.get( "inject" ) != null ) {
|
||||
inject = Boolean.getBoolean( args.get( INJECT ) );
|
||||
}
|
||||
|
||||
String name = args.get( ENCODER );
|
||||
if( name == null ) {
|
||||
throw new SolrException( 500, "Missing required parameter: "+ENCODER
|
||||
+" ["+registry.keySet()+"]" );
|
||||
}
|
||||
Class<? extends Encoder> clazz = registry.get(name.toUpperCase());
|
||||
if( clazz == null ) {
|
||||
throw new SolrException( 500, "Unknown encoder: "+name +" ["+registry.keySet()+"]" );
|
||||
}
|
||||
|
||||
try {
|
||||
encoder = clazz.newInstance();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new SolrException( 500, "Error initializing: "+name + "/"+clazz, e );
|
||||
}
|
||||
}
|
||||
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new PhoneticFilter(input,encoder,name,inject);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,118 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.commons.codec.Encoder;
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
import org.apache.commons.codec.language.RefinedSoundex;
|
||||
import org.apache.commons.codec.language.Soundex;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class TestPhoneticFilter extends TestCase {
|
||||
|
||||
public void testFactory()
|
||||
{
|
||||
Map<String,String> args = new HashMap<String, String>();
|
||||
|
||||
PhoneticFilterFactory ff = new PhoneticFilterFactory();
|
||||
try {
|
||||
ff.init( args );
|
||||
fail( "missing encoder parameter" );
|
||||
}
|
||||
catch( Exception ex ) {}
|
||||
args.put( PhoneticFilterFactory.ENCODER, "XXX" );
|
||||
try {
|
||||
ff.init( args );
|
||||
fail( "unknown encoder parameter" );
|
||||
}
|
||||
catch( Exception ex ) {}
|
||||
|
||||
args.put( PhoneticFilterFactory.ENCODER, "Metaphone" );
|
||||
ff.init( args );
|
||||
assertTrue( ff.encoder instanceof Metaphone );
|
||||
assertTrue( ff.inject ); // default
|
||||
|
||||
args.put( PhoneticFilterFactory.INJECT, "false" );
|
||||
ff.init( args );
|
||||
assertFalse( ff.inject );
|
||||
}
|
||||
|
||||
public void runner( Encoder enc, boolean inject ) throws Exception
|
||||
{
|
||||
String[] input = new String[] {
|
||||
"aaa", "bbb", "ccc", "easgasg"
|
||||
};
|
||||
|
||||
ArrayList<Token> stream = new ArrayList<Token>();
|
||||
ArrayList<Token> output = new ArrayList<Token>();
|
||||
for( String s : input ) {
|
||||
stream.add( new Token( s, 0, s.length() ) );
|
||||
if( inject ) {
|
||||
output.add( new Token( s, 0, s.length() ) );
|
||||
}
|
||||
output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
|
||||
}
|
||||
|
||||
PhoneticFilter filter = new PhoneticFilter(
|
||||
new IterTokenStream(stream.iterator()), enc, "text", inject );
|
||||
|
||||
for( Token t : output ) {
|
||||
Token got = filter.next();
|
||||
assertEquals( t.termText(), got.termText());
|
||||
}
|
||||
assertNull( filter.next() ); // no more tokens
|
||||
}
|
||||
|
||||
public void testEncodes() throws Exception {
|
||||
runner( new DoubleMetaphone(), true );
|
||||
runner( new Metaphone(), true );
|
||||
runner( new Soundex(), true );
|
||||
runner( new RefinedSoundex(), true );
|
||||
|
||||
runner( new DoubleMetaphone(), false );
|
||||
runner( new Metaphone(), false );
|
||||
runner( new Soundex(), false );
|
||||
runner( new RefinedSoundex(), false );
|
||||
}
|
||||
|
||||
public static class IterTokenStream extends TokenStream {
|
||||
Iterator<Token> toks;
|
||||
public IterTokenStream(Iterator<Token> toks) {
|
||||
this.toks = toks;
|
||||
}
|
||||
public Token next() {
|
||||
if (toks.hasNext()) {
|
||||
return toks.next();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue