SOLR-813: Adding DoubleMetaphone Filter and Factory

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@705903 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2008-10-18 18:38:24 +00:00
parent b82469d7c6
commit 180ff531e1
6 changed files with 332 additions and 1 deletions

View File

@ -43,7 +43,7 @@ New Features
See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html
(yonik, Noble Paul, Akshay Ukey via shalin) (yonik, Noble Paul, Akshay Ukey via shalin)
3. SOLR-657: Replace deprecated calls with the non-deprecated equivalents 3. SOLR-657: Replace many deprecated calls with non-deprecated equivalents
(Lars Kotthoff via ryan) (Lars Kotthoff via ryan)
4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir 4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir
@ -56,6 +56,10 @@ New Features
6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes 6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes
since the last commit. (Noble Paul, koji via shalin) since the last commit. (Noble Paul, koji via shalin)
7. SOLR-813: Adding DoubleMetaphone Filter and Factory. Similar to the PhoneticFilter,
but this uses DoubleMetaphone specific calls (including alternate encoding)
(Todd Feak via ryan)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -247,6 +247,14 @@
/> />
</analyzer> </analyzer>
</fieldType> </fieldType>
<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
</analyzer>
</fieldtype>
<!-- since fields of this type are by default not stored or indexed, any data added to <!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright them will be ignored outright

View File

@ -0,0 +1,90 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
public class DoubleMetaphoneFilter extends TokenFilter {
private static final String TOKEN_TYPE = "DoubleMetaphone";
private final LinkedList<Token> remainingTokens = new LinkedList<Token>();
private final DoubleMetaphone encoder = new DoubleMetaphone();
private final boolean inject;
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
super(input);
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;
}
@Override
public final Token next(Token in) throws IOException {
if (!remainingTokens.isEmpty()) {
return remainingTokens.removeFirst();
}
Token t = input.next(in);
if (t != null) {
if (inject) {
remainingTokens.addLast(t);
}
boolean isPhonetic = false;
String v = new String(t.termBuffer(), 0, t.termLength());
String primaryPhoneticValue = encoder.doubleMetaphone(v);
if (primaryPhoneticValue.length() > 0) {
Token token = (Token) t.clone();
if( inject ) {
token.setPositionIncrement( 0 );
}
token.setType( TOKEN_TYPE );
token.setTermBuffer(primaryPhoneticValue);
remainingTokens.addLast(token);
isPhonetic = true;
}
String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
if (alternatePhoneticValue.length() > 0
&& !primaryPhoneticValue.equals(alternatePhoneticValue)) {
Token token = (Token) t.clone();
token.setPositionIncrement( 0 );
token.setType( TOKEN_TYPE );
token.setTermBuffer(alternatePhoneticValue);
remainingTokens.addLast(token);
isPhonetic = true;
}
// If we did not add something, then go to the next one...
if( !isPhonetic ) {
t = next(in);
if( t != null ) {
t.setPositionIncrement( t.getPositionIncrement()+1 );
}
return t;
}
}
return remainingTokens.isEmpty() ? null : remainingTokens.removeFirst();
}
}

View File

@ -0,0 +1,49 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
{
public static final String INJECT = "inject";
public static final String MAX_CODE_LENGTH = "maxCodeLength";
public static final int DEFAULT_MAX_CODE_LENGTH = 4;
private boolean inject = true;
private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
@Override
public void init(Map<String, String> args) {
super.init(args);
if (args.get(INJECT) != null) {
inject = Boolean.getBoolean(args.get(INJECT));
}
if (args.get(MAX_CODE_LENGTH) != null) {
maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH));
}
}
public DoubleMetaphoneFilter create(TokenStream input) {
return new DoubleMetaphoneFilter(input, maxCodeLength, inject);
}
}

View File

@ -0,0 +1,71 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
public class DoubleMetaphoneFilterFactoryTest extends TestCase {
public void testDefaults() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
factory.init(new HashMap<String, String>());
TokenStream inputStream = new IterTokenStream("international");
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
Token token = filteredStream.next(new Token());
assertEquals(13, token.termLength());
assertEquals("international", new String(token.termBuffer(), 0, token
.termLength()));
token = filteredStream.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filteredStream.next(new Token()));
}
public void testSettingSizeAndInject() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
Map<String, String> parameters = new HashMap<String, String>();
parameters.put("inject", "false");
parameters.put("maxCodeLength", "8");
factory.init(parameters);
TokenStream inputStream = new IterTokenStream("international");
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
Token token = filteredStream.next(new Token());
assertEquals(8, token.termLength());
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
.termLength()));
assertNull(filteredStream.next(new Token()));
}
}

View File

@ -0,0 +1,109 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
public class DoubleMetaphoneFilterTest extends TestCase {
public void testSize4FalseInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
Token token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
}
public void testSize4TrueInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
Token token = filter.next(new Token());
assertEquals(13, token.termLength());
assertEquals("international", new String(token.termBuffer(), 0, token
.termLength()));
token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
}
public void testAlternateInjectFalse() throws Exception {
TokenStream stream = new IterTokenStream("Kuczewski");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
Token token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
}
public void testSize8FalseInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
Token token = filter.next(new Token());
assertEquals(8, token.termLength());
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
.termLength()));
assertNull(filter.next(new Token()));
}
public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&" });
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
Token token = filter.next(new Token());
assertEquals(5, token.termLength());
assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
token = filter.next(new Token());
assertEquals(8, token.termLength());
assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
.termLength()));
}
public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&" });
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertNull(filter.next(new Token()));
// should have something after the stream
stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&", "hello" });
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertNotNull(filter.next(new Token()));
}
}