mirror of https://github.com/apache/lucene.git
SOLR-813: Adding DoubleMetaphone Filter and Factory
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@705903 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b82469d7c6
commit
180ff531e1
|
@ -43,7 +43,7 @@ New Features
|
|||
See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html
|
||||
(yonik, Noble Paul, Akshay Ukey via shalin)
|
||||
|
||||
3. SOLR-657: Replace deprecated calls with the non-deprecated equivalents
|
||||
3. SOLR-657: Replace many deprecated calls with non-deprecated equivalents
|
||||
(Lars Kotthoff via ryan)
|
||||
|
||||
4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir
|
||||
|
@ -56,6 +56,10 @@ New Features
|
|||
6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes
|
||||
since the last commit. (Noble Paul, koji via shalin)
|
||||
|
||||
7. SOLR-813: Adding DoubleMetaphone Filter and Factory. Similar to the PhoneticFilter,
|
||||
but this uses DoubleMetaphone specific calls (including alternate encoding)
|
||||
(Todd Feak via ryan)
|
||||
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
|
|
@ -247,6 +247,14 @@
|
|||
/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
|
||||
<!-- since fields of this type are by default not stored or indexed, any data added to
|
||||
them will be ignored outright
|
||||
|
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
public class DoubleMetaphoneFilter extends TokenFilter {
|
||||
|
||||
private static final String TOKEN_TYPE = "DoubleMetaphone";
|
||||
|
||||
private final LinkedList<Token> remainingTokens = new LinkedList<Token>();
|
||||
private final DoubleMetaphone encoder = new DoubleMetaphone();
|
||||
private final boolean inject;
|
||||
|
||||
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
|
||||
super(input);
|
||||
this.encoder.setMaxCodeLen(maxCodeLength);
|
||||
this.inject = inject;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Token next(Token in) throws IOException {
|
||||
if (!remainingTokens.isEmpty()) {
|
||||
return remainingTokens.removeFirst();
|
||||
}
|
||||
|
||||
Token t = input.next(in);
|
||||
if (t != null) {
|
||||
if (inject) {
|
||||
remainingTokens.addLast(t);
|
||||
}
|
||||
|
||||
boolean isPhonetic = false;
|
||||
String v = new String(t.termBuffer(), 0, t.termLength());
|
||||
String primaryPhoneticValue = encoder.doubleMetaphone(v);
|
||||
if (primaryPhoneticValue.length() > 0) {
|
||||
Token token = (Token) t.clone();
|
||||
if( inject ) {
|
||||
token.setPositionIncrement( 0 );
|
||||
}
|
||||
token.setType( TOKEN_TYPE );
|
||||
token.setTermBuffer(primaryPhoneticValue);
|
||||
remainingTokens.addLast(token);
|
||||
isPhonetic = true;
|
||||
}
|
||||
|
||||
String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
|
||||
if (alternatePhoneticValue.length() > 0
|
||||
&& !primaryPhoneticValue.equals(alternatePhoneticValue)) {
|
||||
Token token = (Token) t.clone();
|
||||
token.setPositionIncrement( 0 );
|
||||
token.setType( TOKEN_TYPE );
|
||||
token.setTermBuffer(alternatePhoneticValue);
|
||||
remainingTokens.addLast(token);
|
||||
isPhonetic = true;
|
||||
}
|
||||
|
||||
// If we did not add something, then go to the next one...
|
||||
if( !isPhonetic ) {
|
||||
t = next(in);
|
||||
if( t != null ) {
|
||||
t.setPositionIncrement( t.getPositionIncrement()+1 );
|
||||
}
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
return remainingTokens.isEmpty() ? null : remainingTokens.removeFirst();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
|
||||
{
|
||||
public static final String INJECT = "inject";
|
||||
public static final String MAX_CODE_LENGTH = "maxCodeLength";
|
||||
|
||||
public static final int DEFAULT_MAX_CODE_LENGTH = 4;
|
||||
|
||||
private boolean inject = true;
|
||||
private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
|
||||
if (args.get(INJECT) != null) {
|
||||
inject = Boolean.getBoolean(args.get(INJECT));
|
||||
}
|
||||
|
||||
if (args.get(MAX_CODE_LENGTH) != null) {
|
||||
maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH));
|
||||
}
|
||||
}
|
||||
|
||||
public DoubleMetaphoneFilter create(TokenStream input) {
|
||||
return new DoubleMetaphoneFilter(input, maxCodeLength, inject);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
|
||||
|
||||
public class DoubleMetaphoneFilterFactoryTest extends TestCase {
|
||||
|
||||
public void testDefaults() throws Exception {
|
||||
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
|
||||
factory.init(new HashMap<String, String>());
|
||||
TokenStream inputStream = new IterTokenStream("international");
|
||||
|
||||
TokenStream filteredStream = factory.create(inputStream);
|
||||
|
||||
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
|
||||
|
||||
Token token = filteredStream.next(new Token());
|
||||
assertEquals(13, token.termLength());
|
||||
assertEquals("international", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
|
||||
token = filteredStream.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
assertNull(filteredStream.next(new Token()));
|
||||
}
|
||||
|
||||
public void testSettingSizeAndInject() throws Exception {
|
||||
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
|
||||
Map<String, String> parameters = new HashMap<String, String>();
|
||||
parameters.put("inject", "false");
|
||||
parameters.put("maxCodeLength", "8");
|
||||
factory.init(parameters);
|
||||
|
||||
TokenStream inputStream = new IterTokenStream("international");
|
||||
|
||||
TokenStream filteredStream = factory.create(inputStream);
|
||||
|
||||
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
|
||||
|
||||
Token token = filteredStream.next(new Token());
|
||||
assertEquals(8, token.termLength());
|
||||
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
|
||||
assertNull(filteredStream.next(new Token()));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,109 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
|
||||
|
||||
public class DoubleMetaphoneFilterTest extends TestCase {
|
||||
|
||||
public void testSize4FalseInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("international");
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
assertNull(filter.next(new Token()));
|
||||
}
|
||||
|
||||
public void testSize4TrueInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("international");
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(13, token.termLength());
|
||||
assertEquals("international", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
|
||||
token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
assertNull(filter.next(new Token()));
|
||||
}
|
||||
|
||||
public void testAlternateInjectFalse() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("Kuczewski");
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertNull(filter.next(new Token()));
|
||||
}
|
||||
|
||||
public void testSize8FalseInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("international");
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(8, token.termLength());
|
||||
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
|
||||
assertNull(filter.next(new Token()));
|
||||
}
|
||||
|
||||
public void testNonConvertableStringsWithInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream(
|
||||
new String[] { "12345", "#$%@#^%&" });
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(5, token.termLength());
|
||||
assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
token = filter.next(new Token());
|
||||
assertEquals(8, token.termLength());
|
||||
assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
}
|
||||
|
||||
public void testNonConvertableStringsWithoutInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream(
|
||||
new String[] { "12345", "#$%@#^%&" });
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||
|
||||
assertNull(filter.next(new Token()));
|
||||
|
||||
// should have something after the stream
|
||||
stream = new IterTokenStream(
|
||||
new String[] { "12345", "#$%@#^%&", "hello" });
|
||||
filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||
assertNotNull(filter.next(new Token()));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue