Phonetic Filter : Double Metaphone, partial implementation (not using the secondary code), closes #924.

2025-03-09 14:34:43 +00:00 · 2011-05-11 22:51:11 +03:00 · 2011-05-11 22:51:11 +03:00 · aa9730834f
commit aa9730834f
parent 133305da44
2 changed files with 117 additions and 1 deletions
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/phonetic/DoubleMetaphoneFilter.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/phonetic/DoubleMetaphoneFilter.java
@ -0,0 +1,111 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis.phonetic;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.elasticsearch.common.codec.language.DoubleMetaphone;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+public final class DoubleMetaphoneFilter extends TokenFilter {
+
+    // Captured token states that are still waiting to be emitted, oldest first.
+    private final LinkedList<State> remainingTokens = new LinkedList<State>();
+    private final DoubleMetaphone encoder;
+    // true: emit the original token and stack the codes on its position; false: replace it.
+    private final boolean inject;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public DoubleMetaphoneFilter(TokenStream input, DoubleMetaphone encoder, boolean inject) {
+        super(input);
+        this.encoder = encoder;
+        this.inject = inject;
+    }
+
+    /**
+     * Advances to the next output token: a queued token if any, otherwise the next input
+     * token rewritten to (or, with inject, followed by) its primary and alternate codes.
+     * Zero-length terms and terms whose primary code equals the term pass through unchanged.
+     */
+    @Override
+    public boolean incrementToken() throws IOException {
+        for (; ;) {
+            if (!remainingTokens.isEmpty()) {
+                // clearAttributes();  // not currently necessary
+                restoreState(remainingTokens.removeFirst());
+                return true;
+            }
+            if (!input.incrementToken()) return false;
+            if (termAtt.length() == 0) return true; // pass through zero length terms
+
+            // inject: codes stack on the original (increment 0); else the first code inherits its increment.
+            int firstAlternativeIncrement = inject ? 0 : posAtt.getPositionIncrement();
+
+            String v = termAtt.toString();
+            String primaryPhoneticValue = encoder.doubleMetaphone(v);
+            String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
+
+            // a flag to lazily save state if needed... this avoids a save/restore when only
+            // one token will be generated.
+            boolean saveState = inject;
+
+            if (primaryPhoneticValue != null && primaryPhoneticValue.length() > 0 && !primaryPhoneticValue.equals(v)) {
+                if (saveState) {
+                    remainingTokens.addLast(captureState());
+                }
+                posAtt.setPositionIncrement(firstAlternativeIncrement);
+                firstAlternativeIncrement = 0;
+                termAtt.setEmpty().append(primaryPhoneticValue);
+                saveState = true;
+            }
+
+            // Compare from v's side so a null primary code cannot raise a NullPointerException.
+            if (alternatePhoneticValue != null && alternatePhoneticValue.length() > 0
+                    && !alternatePhoneticValue.equals(primaryPhoneticValue)
+                    && !v.equals(primaryPhoneticValue)) {
+                if (saveState) {
+                    remainingTokens.addLast(captureState());
+                }
+                posAtt.setPositionIncrement(firstAlternativeIncrement);
+                termAtt.setEmpty().append(alternatePhoneticValue);
+                saveState = true;
+            }
+
+            // Nothing queued means a single token results: return it without capture/restore.
+            if (remainingTokens.isEmpty()) {
+                return true;
+            }
+            if (saveState) {
+                remainingTokens.addLast(captureState());
+            }
+        }
+    }
+
+    @Override
+    public void reset() throws IOException {
+        input.reset();
+        remainingTokens.clear();
+    }
+}
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/phonetic/PhoneticTokenFilterFactory.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/phonetic/PhoneticTokenFilterFactory.java
@ -57,13 +57,18 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
        } else if ("refined_soundex".equalsIgnoreCase(encoder) || "refinedSoundex".equalsIgnoreCase(encoder)) {
            this.encoder = new RefinedSoundex();
        } else if ("double_metaphone".equalsIgnoreCase(encoder) || "doubleMetaphone".equalsIgnoreCase(encoder)) {
-            this.encoder = new DoubleMetaphone();
+            DoubleMetaphone doubleMetaphone = new DoubleMetaphone();
+            doubleMetaphone.setMaxCodeLen(settings.getAsInt("max_code_len", doubleMetaphone.getMaxCodeLen()));
+            this.encoder = doubleMetaphone;
        } else {
            throw new ElasticSearchIllegalArgumentException("unknown encoder [" + encoder + "] for phonetic token filter");
        }
    }

    @Override public TokenStream create(TokenStream tokenStream) {
+        if (encoder instanceof DoubleMetaphone) { // Double Metaphone gets its dedicated filter, not PhoneticFilter
+            return new DoubleMetaphoneFilter(tokenStream, (DoubleMetaphone) encoder, inject);
+        }
        return new PhoneticFilter(tokenStream, encoder, name(), inject);
    }
 }