SOLR-2982: add Beider-Morse phonetic filter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1225211 13f79535-47bb-0310-9956-ffa450edef68
2011-12-28 16:00:52 +00:00 · 2011-12-28 16:00:52 +00:00 · f3869ef3ce
parent 21822811a9
commit f3869ef3ce
11 changed files with 350 additions and 8 deletions
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@ -87,7 +87,7 @@
 	<classpathentry kind="lib" path="lucene/lib/junit-4.7.jar"/>
 	<classpathentry kind="lib" path="lucene/contrib/sandbox/lib/jakarta-regexp-1.4.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8_1_1.jar"/>
-	<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.5.jar"/>
+	<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.6.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
 	<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
@ -98,7 +98,6 @@
 	<classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
 	<classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
 	<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1209632.jar"/>
-	<classpathentry kind="lib" path="solr/lib/commons-codec-1.5.jar"/>
 	<classpathentry kind="lib" path="solr/lib/commons-csv-1.0-SNAPSHOT-r966014.jar"/>
 	<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
 	<classpathentry kind="lib" path="solr/lib/commons-httpclient-3.1.jar"/>
--- a/modules/analysis/phonetic/lib/commons-codec-1.5.jar
+++ b/modules/analysis/phonetic/lib/commons-codec-1.5.jar
@ -1,2 +0,0 @@
-AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
-Apache SVN contains full history.
--- a/modules/analysis/phonetic/lib/commons-codec-1.6.jar
+++ b/modules/analysis/phonetic/lib/commons-codec-1.6.jar
@ -0,0 +1,2 @@
+AnyObjectId[ee1bc49acae11cc79eceec51f7be785590e99fd8] was removed in git history.
+Apache SVN contains full history.
--- a/modules/analysis/phonetic/lib/commons-codec-NOTICE.txt
+++ b/modules/analysis/phonetic/lib/commons-codec-NOTICE.txt
@ -1,5 +1,5 @@
 Apache Commons Codec
-Copyright 2002-2009 The Apache Software Foundation
+Copyright 2002-2011 The Apache Software Foundation

 This product includes software developed by
 The Apache Software Foundation (http://www.apache.org/).
--- a/modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
+++ b/modules/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
@ -0,0 +1,109 @@
+package org.apache.lucene.analysis.phonetic;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.codec.language.bm.BeiderMorseEncoder; // javadocs
+import org.apache.commons.codec.language.bm.Languages.LanguageSet;
+import org.apache.commons.codec.language.bm.PhoneticEngine;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * TokenFilter for Beider-Morse phonetic encoding.
+ * @see BeiderMorseEncoder
+ */
+public final class BeiderMorseFilter extends TokenFilter {
+  private final PhoneticEngine engine;
+  private final LanguageSet languages;
+  
+  // output is a string such as ab|ac|...
+  // in complex cases like d'angelo its (anZelo|andZelo|...)-(danZelo|...)
+  // if there are multiple 's, it starts to nest...
+  private static final Pattern pattern = Pattern.compile("([^()|-]+)");
+  
+  // matcher over any buffered output
+  private final Matcher matcher = pattern.matcher("");
+  // encoded representation
+  private String encoded;
+  // offsets for any buffered outputs
+  private int startOffset;
+  private int endOffset;
+  
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  
+  /** 
+   * Calls {@link #BeiderMorseFilter(TokenStream, PhoneticEngine, LanguageSet) 
+   *        BeiderMorseFilter(input, engine, null)}
+   */
+  public BeiderMorseFilter(TokenStream input, PhoneticEngine engine) {
+    this(input, engine, null);
+  }
+
+  /**
+   * Create a new BeiderMorseFilter
+   * @param input TokenStream to filter
+   * @param engine configured PhoneticEngine with BM settings.
+   * @param languages optional Set of original languages. Can be null (which means it will be guessed).
+   */
+  public BeiderMorseFilter(TokenStream input, PhoneticEngine engine, LanguageSet languages) {
+    super(input);
+    this.engine = engine;
+    this.languages = languages;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (matcher.find()) {
+      clearAttributes();
+      termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
+      posIncAtt.setPositionIncrement(0);
+      offsetAtt.setOffset(startOffset, endOffset);
+      return true;
+    }
+    
+    if (input.incrementToken()) {
+      encoded = (languages == null) 
+          ? engine.encode(termAtt.toString())
+          : engine.encode(termAtt.toString(), languages);
+      startOffset = offsetAtt.startOffset();
+      endOffset = offsetAtt.endOffset();
+      matcher.reset(encoded);
+      if (matcher.find()) {
+        termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    matcher.reset("");
+  }
+}
--- a/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java
+++ b/modules/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java
@ -0,0 +1,92 @@
+package org.apache.lucene.analysis.phonetic;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.HashSet;
+
+import org.apache.commons.codec.language.bm.NameType;
+import org.apache.commons.codec.language.bm.PhoneticEngine;
+import org.apache.commons.codec.language.bm.RuleType;
+import org.apache.commons.codec.language.bm.Languages.LanguageSet;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+/** Tests {@link BeiderMorseFilter} */
+public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(tokenizer, 
+          new BeiderMorseFilter(tokenizer, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true)));
+    }
+  };
+  
+  /** generic, "exact" configuration */
+  public void testBasicUsage() throws Exception {    
+    assertAnalyzesTo(analyzer, "Angelo",
+        new String[] { "anZelo", "andZelo", "angelo", "anhelo", "anjelo", "anxelo" },
+        new int[] { 0, 0, 0, 0, 0, 0 },
+        new int[] { 6, 6, 6, 6, 6, 6 },
+        new int[] { 1, 0, 0, 0, 0, 0 });
+    
+    assertAnalyzesTo(analyzer, "D'Angelo",
+        new String[] { "anZelo", "andZelo", "angelo", "anhelo", "anjelo", "anxelo",
+                  "danZelo", "dandZelo", "dangelo", "danhelo", "danjelo", "danxelo" },
+        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+        new int[] { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 },
+        new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
+  }
+  
+  /** restrict the output to a set of possible origin languages */
+  public void testLanguageSet() throws Exception {
+    final LanguageSet languages = LanguageSet.from(new HashSet<String>() {{
+      add("italian"); add("greek"); add("spanish");
+    }});
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, 
+            new BeiderMorseFilter(tokenizer, 
+                new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true), languages));
+      }
+    };
+    assertAnalyzesTo(analyzer, "Angelo",
+        new String[] { "andZelo", "angelo", "anxelo" },
+        new int[] { 0, 0, 0, },
+        new int[] { 6, 6, 6, },
+        new int[] { 1, 0, 0, });
+  }
+  
+  /** for convenience, if the input yields no output, we pass it thru as-is */
+  public void testNumbers() throws Exception {
+    assertAnalyzesTo(analyzer, "1234",
+        new String[] { "1234" },
+        new int[] { 0 },
+        new int[] { 4 },
+        new int[] { 1 });
+  }
+  
+  public void testRandom() throws Exception {
+    checkRandomData(random, analyzer, 1000 * RANDOM_MULTIPLIER); 
+  }
+}
--- a/solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
@ -0,0 +1,76 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Map;
+
+import org.apache.commons.codec.language.bm.Languages.LanguageSet;
+import org.apache.commons.codec.language.bm.NameType;
+import org.apache.commons.codec.language.bm.PhoneticEngine;
+import org.apache.commons.codec.language.bm.RuleType;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
+
+/** 
+ * Factory for {@link BeiderMorseFilter}.
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_bm" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.BeiderMorseFilterFactory"
+ *        nameType="GENERIC" ruleType="APPROX" 
+ *        concat="true" languageSet="auto"
+ *     &lt;/filter&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ */
+public class BeiderMorseFilterFactory extends BaseTokenFilterFactory {
+  private PhoneticEngine engine;
+  private LanguageSet languageSet;
+  
+  public void init(Map<String,String> args) {
+    super.init(args);
+    
+    // PhoneticEngine = NameType + RuleType + concat
+    // we use common-codec's defaults: GENERIC + APPROX + true
+    String nameTypeArg = args.get("nameType");
+    NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
+
+    String ruleTypeArg = args.get("ruleType");
+    RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
+    
+    boolean concat = getBoolean("concat", true);
+    engine = new PhoneticEngine(nameType, ruleType, concat);
+    
+    // LanguageSet: defaults to automagic, otherwise a comma-separated list.
+    String languageSetArg = args.get("languageSet");
+    if (languageSetArg == null || languageSetArg.equals("auto")) {
+      languageSet = null;
+    } else {
+      languageSet = LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new BeiderMorseFilter(input, engine, languageSet);
+  }
+}
--- a/solr/core/src/test/org/apache/solr/analysis/TestBeiderMorseFilterFactory.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestBeiderMorseFilterFactory.java
@ -0,0 +1,66 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/** Simple tests for {@link BeiderMorseFilterFactory} */
+public class TestBeiderMorseFilterFactory extends BaseTokenTestCase {
+  public void testBasics() throws Exception {
+    BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory();
+    factory.init(DEFAULT_VERSION_PARAM);
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(ts,
+        new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" },
+        new int[] { 0, 0, 0, 0, 0, 0 },
+        new int[] { 8, 8, 8, 8, 8, 8 },
+        new int[] { 1, 0, 0, 0, 0, 0 });
+  }
+  
+  public void testLanguageSet() throws Exception {
+    BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("languageSet", "polish");
+    factory.init(args);
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(ts,
+        new String[] { "vDmbYrk", "vDmbirk", "vambYrk", "vambirk", "vimbYrk", "vimbirk" },
+        new int[] { 0, 0, 0, 0, 0, 0 },
+        new int[] { 8, 8, 8, 8, 8, 8 },
+        new int[] { 1, 0, 0, 0, 0, 0 });
+  }
+  
+  public void testOptions() throws Exception {
+    BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("nameType", "ASHKENAZI");
+    args.put("ruleType", "EXACT");
+    factory.init(args);
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false));
+    assertTokenStreamContents(ts,
+        new String[] { "vajnberk" },
+        new int[] { 0 },
+        new int[] { 8 },
+        new int[] { 1 });
+  }
+}
--- a/solr/lib/commons-codec-1.5.jar
+++ b/solr/lib/commons-codec-1.5.jar
@ -1,2 +0,0 @@
-AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
-Apache SVN contains full history.
--- a/solr/lib/commons-codec-1.6.jar
+++ b/solr/lib/commons-codec-1.6.jar
@ -0,0 +1,2 @@
+AnyObjectId[ee1bc49acae11cc79eceec51f7be785590e99fd8] was removed in git history.
+Apache SVN contains full history.
--- a/solr/lib/commons-codec-NOTICE.txt
+++ b/solr/lib/commons-codec-NOTICE.txt
@ -1,5 +1,5 @@
 Apache Commons Codec
-Copyright 2002-2009 The Apache Software Foundation
+Copyright 2002-2011 The Apache Software Foundation

 This product includes software developed by
 The Apache Software Foundation (http://www.apache.org/).