mirror of https://github.com/apache/lucene.git
SOLR-813: Adding DoubleMetaphone Filter and Factory
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@705903 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b82469d7c6
commit
180ff531e1
|
@ -43,7 +43,7 @@ New Features
|
||||||
See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html
|
See http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html
|
||||||
(yonik, Noble Paul, Akshay Ukey via shalin)
|
(yonik, Noble Paul, Akshay Ukey via shalin)
|
||||||
|
|
||||||
3. SOLR-657: Replace deprecated calls with the non-deprecated equivalents
|
3. SOLR-657: Replace many deprecated calls with non-deprecated equivalents
|
||||||
(Lars Kotthoff via ryan)
|
(Lars Kotthoff via ryan)
|
||||||
|
|
||||||
4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir
|
4. SOLR-658: Allow Solr to load index from arbitrary directory in dataDir
|
||||||
|
@ -56,6 +56,10 @@ New Features
|
||||||
6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes
|
6. SOLR-670: Add support for rollbacks in UpdateHandler. This allows user to rollback all changes
|
||||||
since the last commit. (Noble Paul, koji via shalin)
|
since the last commit. (Noble Paul, koji via shalin)
|
||||||
|
|
||||||
|
7. SOLR-813: Add DoubleMetaphone Filter and Factory. Similar to the PhoneticFilter,
|
||||||
|
but this uses DoubleMetaphone-specific calls (including the alternate encoding)
|
||||||
|
(Todd Feak via ryan)
|
||||||
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
|
@ -247,6 +247,14 @@
|
||||||
/>
|
/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
|
<!-- Phonetic field type: text is split by the standard tokenizer and each term is then passed through the Double Metaphone filter. inject="false" asks the filter to emit only the phonetic codes and not the original terms. -->
<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldtype>
|
||||||
|
|
||||||
|
|
||||||
<!-- since fields of this type are by default not stored or indexed, any data added to
|
<!-- since fields of this type are by default not stored or indexed, any data added to
|
||||||
them will be ignored outright
|
them will be ignored outright
|
||||||
|
|
|
@ -0,0 +1,90 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
public class DoubleMetaphoneFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private static final String TOKEN_TYPE = "DoubleMetaphone";
|
||||||
|
|
||||||
|
private final LinkedList<Token> remainingTokens = new LinkedList<Token>();
|
||||||
|
private final DoubleMetaphone encoder = new DoubleMetaphone();
|
||||||
|
private final boolean inject;
|
||||||
|
|
||||||
|
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
|
||||||
|
super(input);
|
||||||
|
this.encoder.setMaxCodeLen(maxCodeLength);
|
||||||
|
this.inject = inject;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final Token next(Token in) throws IOException {
|
||||||
|
if (!remainingTokens.isEmpty()) {
|
||||||
|
return remainingTokens.removeFirst();
|
||||||
|
}
|
||||||
|
|
||||||
|
Token t = input.next(in);
|
||||||
|
if (t != null) {
|
||||||
|
if (inject) {
|
||||||
|
remainingTokens.addLast(t);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isPhonetic = false;
|
||||||
|
String v = new String(t.termBuffer(), 0, t.termLength());
|
||||||
|
String primaryPhoneticValue = encoder.doubleMetaphone(v);
|
||||||
|
if (primaryPhoneticValue.length() > 0) {
|
||||||
|
Token token = (Token) t.clone();
|
||||||
|
if( inject ) {
|
||||||
|
token.setPositionIncrement( 0 );
|
||||||
|
}
|
||||||
|
token.setType( TOKEN_TYPE );
|
||||||
|
token.setTermBuffer(primaryPhoneticValue);
|
||||||
|
remainingTokens.addLast(token);
|
||||||
|
isPhonetic = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
|
||||||
|
if (alternatePhoneticValue.length() > 0
|
||||||
|
&& !primaryPhoneticValue.equals(alternatePhoneticValue)) {
|
||||||
|
Token token = (Token) t.clone();
|
||||||
|
token.setPositionIncrement( 0 );
|
||||||
|
token.setType( TOKEN_TYPE );
|
||||||
|
token.setTermBuffer(alternatePhoneticValue);
|
||||||
|
remainingTokens.addLast(token);
|
||||||
|
isPhonetic = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we did not add something, then go to the next one...
|
||||||
|
if( !isPhonetic ) {
|
||||||
|
t = next(in);
|
||||||
|
if( t != null ) {
|
||||||
|
t.setPositionIncrement( t.getPositionIncrement()+1 );
|
||||||
|
}
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return remainingTokens.isEmpty() ? null : remainingTokens.removeFirst();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
|
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
|
||||||
|
{
|
||||||
|
public static final String INJECT = "inject";
|
||||||
|
public static final String MAX_CODE_LENGTH = "maxCodeLength";
|
||||||
|
|
||||||
|
public static final int DEFAULT_MAX_CODE_LENGTH = 4;
|
||||||
|
|
||||||
|
private boolean inject = true;
|
||||||
|
private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void init(Map<String, String> args) {
|
||||||
|
super.init(args);
|
||||||
|
|
||||||
|
if (args.get(INJECT) != null) {
|
||||||
|
inject = Boolean.getBoolean(args.get(INJECT));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (args.get(MAX_CODE_LENGTH) != null) {
|
||||||
|
maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public DoubleMetaphoneFilter create(TokenStream input) {
|
||||||
|
return new DoubleMetaphoneFilter(input, maxCodeLength, inject);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,71 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
|
||||||
|
|
||||||
|
public class DoubleMetaphoneFilterFactoryTest extends TestCase {
|
||||||
|
|
||||||
|
public void testDefaults() throws Exception {
|
||||||
|
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
|
||||||
|
factory.init(new HashMap<String, String>());
|
||||||
|
TokenStream inputStream = new IterTokenStream("international");
|
||||||
|
|
||||||
|
TokenStream filteredStream = factory.create(inputStream);
|
||||||
|
|
||||||
|
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
|
||||||
|
|
||||||
|
Token token = filteredStream.next(new Token());
|
||||||
|
assertEquals(13, token.termLength());
|
||||||
|
assertEquals("international", new String(token.termBuffer(), 0, token
|
||||||
|
.termLength()));
|
||||||
|
|
||||||
|
token = filteredStream.next(new Token());
|
||||||
|
assertEquals(4, token.termLength());
|
||||||
|
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||||
|
|
||||||
|
assertNull(filteredStream.next(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSettingSizeAndInject() throws Exception {
|
||||||
|
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
|
||||||
|
Map<String, String> parameters = new HashMap<String, String>();
|
||||||
|
parameters.put("inject", "false");
|
||||||
|
parameters.put("maxCodeLength", "8");
|
||||||
|
factory.init(parameters);
|
||||||
|
|
||||||
|
TokenStream inputStream = new IterTokenStream("international");
|
||||||
|
|
||||||
|
TokenStream filteredStream = factory.create(inputStream);
|
||||||
|
|
||||||
|
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
|
||||||
|
|
||||||
|
Token token = filteredStream.next(new Token());
|
||||||
|
assertEquals(8, token.termLength());
|
||||||
|
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
|
||||||
|
.termLength()));
|
||||||
|
|
||||||
|
assertNull(filteredStream.next(new Token()));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,109 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
|
||||||
|
|
||||||
|
public class DoubleMetaphoneFilterTest extends TestCase {
|
||||||
|
|
||||||
|
public void testSize4FalseInject() throws Exception {
|
||||||
|
TokenStream stream = new IterTokenStream("international");
|
||||||
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||||
|
|
||||||
|
Token token = filter.next(new Token());
|
||||||
|
assertEquals(4, token.termLength());
|
||||||
|
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||||
|
|
||||||
|
assertNull(filter.next(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSize4TrueInject() throws Exception {
|
||||||
|
TokenStream stream = new IterTokenStream("international");
|
||||||
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
|
||||||
|
|
||||||
|
Token token = filter.next(new Token());
|
||||||
|
assertEquals(13, token.termLength());
|
||||||
|
assertEquals("international", new String(token.termBuffer(), 0, token
|
||||||
|
.termLength()));
|
||||||
|
|
||||||
|
token = filter.next(new Token());
|
||||||
|
assertEquals(4, token.termLength());
|
||||||
|
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||||
|
|
||||||
|
assertNull(filter.next(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAlternateInjectFalse() throws Exception {
|
||||||
|
TokenStream stream = new IterTokenStream("Kuczewski");
|
||||||
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||||
|
|
||||||
|
Token token = filter.next(new Token());
|
||||||
|
assertEquals(4, token.termLength());
|
||||||
|
assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
|
||||||
|
|
||||||
|
token = filter.next(new Token());
|
||||||
|
assertEquals(4, token.termLength());
|
||||||
|
assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
|
||||||
|
assertNull(filter.next(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSize8FalseInject() throws Exception {
|
||||||
|
TokenStream stream = new IterTokenStream("international");
|
||||||
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||||
|
|
||||||
|
Token token = filter.next(new Token());
|
||||||
|
assertEquals(8, token.termLength());
|
||||||
|
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
|
||||||
|
.termLength()));
|
||||||
|
|
||||||
|
assertNull(filter.next(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNonConvertableStringsWithInject() throws Exception {
|
||||||
|
TokenStream stream = new IterTokenStream(
|
||||||
|
new String[] { "12345", "#$%@#^%&" });
|
||||||
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
|
||||||
|
|
||||||
|
Token token = filter.next(new Token());
|
||||||
|
assertEquals(5, token.termLength());
|
||||||
|
assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
|
||||||
|
|
||||||
|
token = filter.next(new Token());
|
||||||
|
assertEquals(8, token.termLength());
|
||||||
|
assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
|
||||||
|
.termLength()));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNonConvertableStringsWithoutInject() throws Exception {
|
||||||
|
TokenStream stream = new IterTokenStream(
|
||||||
|
new String[] { "12345", "#$%@#^%&" });
|
||||||
|
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||||
|
|
||||||
|
assertNull(filter.next(new Token()));
|
||||||
|
|
||||||
|
// should have something after the stream
|
||||||
|
stream = new IterTokenStream(
|
||||||
|
new String[] { "12345", "#$%@#^%&", "hello" });
|
||||||
|
filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||||
|
assertNotNull(filter.next(new Token()));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue