Phonetic Filter : Double Metaphone, partial implementation (not using the secondary code), closes #924.

This commit is contained in:
kimchy 2011-05-11 22:51:11 +03:00
parent 133305da44
commit aa9730834f
2 changed files with 117 additions and 1 deletions

View File

@ -0,0 +1,111 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis.phonetic;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.codec.language.DoubleMetaphone;
import java.io.IOException;
import java.util.LinkedList;
public final class DoubleMetaphoneFilter extends TokenFilter {
private static final String TOKEN_TYPE = "DoubleMetaphone";
private final LinkedList<State> remainingTokens = new LinkedList<State>();
private final DoubleMetaphone encoder;
private final boolean inject;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
public DoubleMetaphoneFilter(TokenStream input, DoubleMetaphone encoder, boolean inject) {
super(input);
this.encoder = encoder;
this.inject = inject;
}
@Override
public boolean incrementToken() throws IOException {
for (; ;) {
if (!remainingTokens.isEmpty()) {
// clearAttributes(); // not currently necessary
restoreState(remainingTokens.removeFirst());
return true;
}
if (!input.incrementToken()) return false;
int len = termAtt.length();
if (len == 0) return true; // pass through zero length terms
int firstAlternativeIncrement = inject ? 0 : posAtt.getPositionIncrement();
String v = termAtt.toString();
String primaryPhoneticValue = encoder.doubleMetaphone(v);
String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
// a flag to lazily save state if needed... this avoids a save/restore when only
// one token will be generated.
boolean saveState = inject;
if (primaryPhoneticValue != null && primaryPhoneticValue.length() > 0 && !primaryPhoneticValue.equals(v)) {
if (saveState) {
remainingTokens.addLast(captureState());
}
posAtt.setPositionIncrement(firstAlternativeIncrement);
firstAlternativeIncrement = 0;
termAtt.setEmpty().append(primaryPhoneticValue);
saveState = true;
}
if (alternatePhoneticValue != null && alternatePhoneticValue.length() > 0
&& !alternatePhoneticValue.equals(primaryPhoneticValue)
&& !primaryPhoneticValue.equals(v)) {
if (saveState) {
remainingTokens.addLast(captureState());
saveState = false;
}
posAtt.setPositionIncrement(firstAlternativeIncrement);
termAtt.setEmpty().append(alternatePhoneticValue);
saveState = true;
}
// Just one token to return, so no need to capture/restore
// any state, simply return it.
if (remainingTokens.isEmpty()) {
return true;
}
if (saveState) {
remainingTokens.addLast(captureState());
}
}
}
@Override
public void reset() throws IOException {
input.reset();
remainingTokens.clear();
}
}

View File

@ -57,13 +57,18 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
} else if ("refined_soundex".equalsIgnoreCase(encoder) || "refinedSoundex".equalsIgnoreCase(encoder)) {
this.encoder = new RefinedSoundex();
} else if ("double_metaphone".equalsIgnoreCase(encoder) || "doubleMetaphone".equalsIgnoreCase(encoder)) {
this.encoder = new DoubleMetaphone();
DoubleMetaphone doubleMetaphone = new DoubleMetaphone();
doubleMetaphone.setMaxCodeLen(settings.getAsInt("max_code_len", doubleMetaphone.getMaxCodeLen()));
this.encoder = doubleMetaphone;
} else {
throw new ElasticSearchIllegalArgumentException("unknown encoder [" + encoder + "] for phonetic token filter");
}
}
@Override public TokenStream create(TokenStream tokenStream) {
if (encoder instanceof DoubleMetaphone) {
return new DoubleMetaphoneFilter(tokenStream, (DoubleMetaphone) encoder, inject);
}
return new PhoneticFilter(tokenStream, encoder, name(), inject);
}
}