From e140b8907c50bd14548e931922f3e24f07bdc8df Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 11 Nov 2014 18:47:14 +0000 Subject: [PATCH] LUCENE-6059: add Daitch-Mokotoff Soundex phonetic Apache commons phonetic codec git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1638250 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../phonetic/DaitchMokotoffSoundexFilter.java | 100 ++++++++++++++++++ .../DaitchMokotoffSoundexFilterFactory.java | 66 ++++++++++++ .../TestBeiderMorseFilterFactory.java | 10 +- .../TestDaitchMokotoffSoundexFilter.java | 84 +++++++++++++++ ...estDaitchMokotoffSoundexFilterFactory.java | 65 ++++++++++++ lucene/ivy-versions.properties | 2 +- lucene/licenses/commons-codec-1.10.jar.sha1 | 1 + lucene/licenses/commons-codec-1.9.jar.sha1 | 1 - solr/licenses/commons-codec-1.10.jar.sha1 | 1 + solr/licenses/commons-codec-1.9.jar.sha1 | 1 - 11 files changed, 326 insertions(+), 9 deletions(-) create mode 100644 lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java create mode 100644 lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java create mode 100644 lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java create mode 100644 lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java create mode 100644 lucene/licenses/commons-codec-1.10.jar.sha1 delete mode 100644 lucene/licenses/commons-codec-1.9.jar.sha1 create mode 100644 solr/licenses/commons-codec-1.10.jar.sha1 delete mode 100644 solr/licenses/commons-codec-1.9.jar.sha1 diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8854cbe9fae..1d67752310e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -82,6 +82,10 @@ New Features * LUCENE-4400: Add support for new NYSIIS Apache commons phonetic codec (Thomas Neidhart via Mike McCandless) +* LUCENE-6059: Add Daitch-Mokotoff Soundex phonetic Apache commons + phonetic codec, and upgrade to Apache commons codec 1.10 (Thomas + Neidhart via Mike McCandless) + API Changes * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and diff --git a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java new file mode 100644 index 00000000000..a27bebf3566 --- /dev/null +++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilter.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.phonetic; + +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.codec.language.DaitchMokotoffSoundex; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** + * Create tokens for phonetic matches based on Daitch–Mokotoff Soundex. + * + * @lucene.experimental + */ +public final class DaitchMokotoffSoundexFilter extends TokenFilter { + /** true if encoded tokens should be added as synonyms */ + protected boolean inject = true; + /** phonetic encoder */ + protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex(); + + // output is a string such as ab|ac|... + private static final Pattern pattern = Pattern.compile("([^|]+)"); + + // matcher over any buffered output + private final Matcher matcher = pattern.matcher(""); + + // encoded representation + private String encoded; + // preserves all attributes for any buffered outputs + private State state; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); + + /** + * Creates a DaitchMokotoffSoundexFilter by either adding encoded forms as synonyms ( + * inject=true) or replacing them. + */ + public DaitchMokotoffSoundexFilter(TokenStream in, boolean inject) { + super(in); + this.inject = inject; + } + + @Override + public boolean incrementToken() throws IOException { + if (matcher.find()) { + assert state != null && encoded != null; + restoreState(state); + termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1)); + posAtt.setPositionIncrement(0); + return true; + } + + if (input.incrementToken()) { + // pass through zero-length terms + if (termAtt.length() == 0) { + return true; + } + + encoded = encoder.soundex(termAtt.toString()); + state = captureState(); + matcher.reset(encoded); + + if (!inject) { + if (matcher.find()) { + termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1)); + } + } + return true; + } else { + return false; + } + } + + @Override + public void reset() throws IOException { + super.reset(); + matcher.reset(""); + state = null; + } +} diff --git a/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java new file mode 100644 index 00000000000..aa2d3d52c85 --- /dev/null +++ b/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DaitchMokotoffSoundexFilterFactory.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.phonetic; + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link DaitchMokotoffSoundexFilter}. + * + * Create tokens based on Daitch–Mokotoff Soundex phonetic filter. + *

+ * This takes one optional argument: + *

+ *
inject
(default=true) add tokens to the stream with the offset=0
+ *
+ * + *
+ * <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.DaitchMokotoffSoundexFilterFactory" inject="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @see DaitchMokotoffSoundexFilter + * + * @lucene.experimental + */ +public class DaitchMokotoffSoundexFilterFactory extends TokenFilterFactory { + /** parameter name: true if encoded tokens should be added as synonyms */ + public static final String INJECT = "inject"; // boolean + + final boolean inject; //accessed by the test + + /** Creates a new PhoneticFilterFactory */ + public DaitchMokotoffSoundexFilterFactory(Map args) { + super(args); + inject = getBoolean(args, INJECT, true); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public DaitchMokotoffSoundexFilter create(TokenStream input) { + return new DaitchMokotoffSoundexFilter(input, inject); + } + +} diff --git a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java index b6151d02e41..ece793f0be0 100644 --- a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java +++ b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java @@ -17,12 +17,10 @@ package org.apache.lucene.analysis.phonetic; * limitations under the License. */ -import java.io.StringReader; import java.util.HashMap; import java.util.Map; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; /** Simple tests for {@link BeiderMorseFilterFactory} */ @@ -31,10 +29,10 @@ public class TestBeiderMorseFilterFactory extends BaseTokenStreamTestCase { BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(new HashMap()); TokenStream ts = factory.create(whitespaceMockTokenizer("Weinberg")); assertTokenStreamContents(ts, - new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" }, - new int[] { 0, 0, 0, 0, 0, 0 }, - new int[] { 8, 8, 8, 8, 8, 8 }, - new int[] { 1, 0, 0, 0, 0, 0 }); + new String[] { "vDnbYrk", "vDnbirk", "vanbYrk", "vanbirk", "vinbYrk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" }, + new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0}, + new int[] { 8, 8, 8, 8, 8, 8, 8, 8, 8}, + new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0}); } public void testLanguageSet() throws Exception { diff --git a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java new file mode 100644 index 00000000000..4b0d436cd38 --- /dev/null +++ b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilter.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.phonetic; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Tests {@link DaitchMokotoffSoundexFilter} + */ +public class TestDaitchMokotoffSoundexFilter extends BaseTokenStreamTestCase { + + public void testAlgorithms() throws Exception { + assertAlgorithm(true, "aaa bbb ccc easgasg", + new String[] { "aaa", "000000", "bbb", "700000", "ccc", "400000", "450000", "454000", + "540000", "545000", "500000", "easgasg", "045450" }); + assertAlgorithm(false, "aaa bbb ccc easgasg", + new String[] { "000000", "700000", "400000", "450000", "454000", "540000", "545000", + "500000", "045450" }); + } + + static void assertAlgorithm(boolean inject, String input, String[] expected) throws Exception { + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(input)); + DaitchMokotoffSoundexFilter filter = new DaitchMokotoffSoundexFilter(tokenizer, inject); + assertTokenStreamContents(filter, expected); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false)); + } + }; + + checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER); + + Analyzer b = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false)); + } + }; + + checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER); + } + + public void testEmptyTerm() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, random().nextBoolean())); + } + }; + checkOneTerm(a, "", ""); + } + +} diff --git a/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java new file mode 100644 index 00000000000..8dc74e002d0 --- /dev/null +++ b/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDaitchMokotoffSoundexFilterFactory.java @@ -0,0 +1,65 @@ +package org.apache.lucene.analysis.phonetic; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +public class TestDaitchMokotoffSoundexFilterFactory extends BaseTokenStreamTestCase { + + public void testDefaults() throws Exception { + DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(new HashMap()); + Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false); + inputStream.setReader(new StringReader("international")); + + TokenStream filteredStream = factory.create(inputStream); + assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass()); + assertTokenStreamContents(filteredStream, new String[] { "international", "063963" }); + } + + public void testSettingInject() throws Exception { + Map parameters = new HashMap<>(); + parameters.put("inject", "false"); + DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(parameters); + + Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false); + inputStream.setReader(new StringReader("international")); + + TokenStream filteredStream = factory.create(inputStream); + assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass()); + assertTokenStreamContents(filteredStream, new String[] { "063963" }); + } + + /** Test that bogus arguments result in exception */ + public void testBogusArguments() throws Exception { + try { + new DaitchMokotoffSoundexFilterFactory(new HashMap() {{ + put("bogusArg", "bogusValue"); + }}); + fail(); + } catch (IllegalArgumentException expected) { + assertTrue(expected.getMessage().contains("Unknown parameters")); + } + } +} diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index eb3072745d4..bc83109b3c4 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -55,7 +55,7 @@ com.sun.jersey.version = 1.9 /com.uwyn/jhighlight = 1.0 /commons-beanutils/commons-beanutils = 1.8.3 /commons-cli/commons-cli = 1.2 -/commons-codec/commons-codec = 1.9 +/commons-codec/commons-codec = 1.10 /commons-collections/commons-collections = 3.2.1 /commons-configuration/commons-configuration = 1.6 /commons-digester/commons-digester = 2.1 diff --git a/lucene/licenses/commons-codec-1.10.jar.sha1 b/lucene/licenses/commons-codec-1.10.jar.sha1 new file mode 100644 index 00000000000..ebd32cee723 --- /dev/null +++ b/lucene/licenses/commons-codec-1.10.jar.sha1 @@ -0,0 +1 @@ +4b95f4897fa13f2cd904aee711aeafc0c5295cd8 diff --git a/lucene/licenses/commons-codec-1.9.jar.sha1 b/lucene/licenses/commons-codec-1.9.jar.sha1 deleted file mode 100644 index 5a14ad7393f..00000000000 --- a/lucene/licenses/commons-codec-1.9.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -9ce04e34240f674bc72680f8b843b1457383161a diff --git a/solr/licenses/commons-codec-1.10.jar.sha1 b/solr/licenses/commons-codec-1.10.jar.sha1 new file mode 100644 index 00000000000..ebd32cee723 --- /dev/null +++ b/solr/licenses/commons-codec-1.10.jar.sha1 @@ -0,0 +1 @@ +4b95f4897fa13f2cd904aee711aeafc0c5295cd8 diff --git a/solr/licenses/commons-codec-1.9.jar.sha1 b/solr/licenses/commons-codec-1.9.jar.sha1 deleted file mode 100644 index 5a14ad7393f..00000000000 --- a/solr/licenses/commons-codec-1.9.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -9ce04e34240f674bc72680f8b843b1457383161a