SOLR-813: Adding DoubleMetaphone Filter and Factory

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@705903 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2008-10-18 18:38:24 +00:00
parent b82469d7c6
commit 180ff531e1
6 changed files with 332 additions and 1 deletions

View File

@ -43,7 +43,7 @@ New Features
See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html
(yonik, Noble Paul, Akshay Ukey via shalin)
3. SOLR-657: Replace deprecated calls with the non-deprecated equivalents
3. SOLR-657: Replace many deprecated calls with non-deprecated equivalents
(Lars Kotthoff via ryan)
4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir
@ -56,6 +56,10 @@ New Features
6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes
since the last commit. (Noble Paul, koji via shalin)
7. SOLR-813: Adding DoubleMetaphone Filter and Factory. Similar to the PhoneticFilter,
but this uses DoubleMetaphone specific calls (including alternate encoding)
(Todd Feak via ryan)
Optimizations
----------------------

View File

@ -247,6 +247,14 @@
/>
</analyzer>
</fieldType>
<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
</analyzer>
</fieldtype>
<!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright

View File

@ -0,0 +1,90 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
public class DoubleMetaphoneFilter extends TokenFilter {
private static final String TOKEN_TYPE = "DoubleMetaphone";
private final LinkedList<Token> remainingTokens = new LinkedList<Token>();
private final DoubleMetaphone encoder = new DoubleMetaphone();
private final boolean inject;
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
super(input);
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;
}
@Override
public final Token next(Token in) throws IOException {
if (!remainingTokens.isEmpty()) {
return remainingTokens.removeFirst();
}
Token t = input.next(in);
if (t != null) {
if (inject) {
remainingTokens.addLast(t);
}
boolean isPhonetic = false;
String v = new String(t.termBuffer(), 0, t.termLength());
String primaryPhoneticValue = encoder.doubleMetaphone(v);
if (primaryPhoneticValue.length() > 0) {
Token token = (Token) t.clone();
if( inject ) {
token.setPositionIncrement( 0 );
}
token.setType( TOKEN_TYPE );
token.setTermBuffer(primaryPhoneticValue);
remainingTokens.addLast(token);
isPhonetic = true;
}
String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
if (alternatePhoneticValue.length() > 0
&& !primaryPhoneticValue.equals(alternatePhoneticValue)) {
Token token = (Token) t.clone();
token.setPositionIncrement( 0 );
token.setType( TOKEN_TYPE );
token.setTermBuffer(alternatePhoneticValue);
remainingTokens.addLast(token);
isPhonetic = true;
}
// If we did not add something, then go to the next one...
if( !isPhonetic ) {
t = next(in);
if( t != null ) {
t.setPositionIncrement( t.getPositionIncrement()+1 );
}
return t;
}
}
return remainingTokens.isEmpty() ? null : remainingTokens.removeFirst();
}
}

View File

@ -0,0 +1,49 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
{
public static final String INJECT = "inject";
public static final String MAX_CODE_LENGTH = "maxCodeLength";
public static final int DEFAULT_MAX_CODE_LENGTH = 4;
private boolean inject = true;
private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
@Override
public void init(Map<String, String> args) {
super.init(args);
if (args.get(INJECT) != null) {
inject = Boolean.getBoolean(args.get(INJECT));
}
if (args.get(MAX_CODE_LENGTH) != null) {
maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH));
}
}
public DoubleMetaphoneFilter create(TokenStream input) {
return new DoubleMetaphoneFilter(input, maxCodeLength, inject);
}
}

View File

@ -0,0 +1,71 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
public class DoubleMetaphoneFilterFactoryTest extends TestCase {
public void testDefaults() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
factory.init(new HashMap<String, String>());
TokenStream inputStream = new IterTokenStream("international");
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
Token token = filteredStream.next(new Token());
assertEquals(13, token.termLength());
assertEquals("international", new String(token.termBuffer(), 0, token
.termLength()));
token = filteredStream.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filteredStream.next(new Token()));
}
public void testSettingSizeAndInject() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
Map<String, String> parameters = new HashMap<String, String>();
parameters.put("inject", "false");
parameters.put("maxCodeLength", "8");
factory.init(parameters);
TokenStream inputStream = new IterTokenStream("international");
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
Token token = filteredStream.next(new Token());
assertEquals(8, token.termLength());
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
.termLength()));
assertNull(filteredStream.next(new Token()));
}
}

View File

@ -0,0 +1,109 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
public class DoubleMetaphoneFilterTest extends TestCase {
public void testSize4FalseInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
Token token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
}
public void testSize4TrueInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
Token token = filter.next(new Token());
assertEquals(13, token.termLength());
assertEquals("international", new String(token.termBuffer(), 0, token
.termLength()));
token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
}
public void testAlternateInjectFalse() throws Exception {
TokenStream stream = new IterTokenStream("Kuczewski");
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
Token token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
}
public void testSize8FalseInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
Token token = filter.next(new Token());
assertEquals(8, token.termLength());
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
.termLength()));
assertNull(filter.next(new Token()));
}
public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&" });
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
Token token = filter.next(new Token());
assertEquals(5, token.termLength());
assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
token = filter.next(new Token());
assertEquals(8, token.termLength());
assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
.termLength()));
}
public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&" });
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertNull(filter.next(new Token()));
// should have something after the stream
stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&", "hello" });
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertNotNull(filter.next(new Token()));
}
}