mirror of https://github.com/apache/lucene.git
LUCENE-6059: add Daitch-Mokotoff Soundex phonetic Apache commons phonetic codec
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1638250 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
937f81b03c
commit
e140b8907c
|
@ -82,6 +82,10 @@ New Features
|
||||||
* LUCENE-4400: Add support for new NYSIIS Apache commons phonetic
|
* LUCENE-4400: Add support for new NYSIIS Apache commons phonetic
|
||||||
codec (Thomas Neidhart via Mike McCandless)
|
codec (Thomas Neidhart via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-6059: Add Daitch-Mokotoff Soundex phonetic Apache commons
|
||||||
|
phonetic codec, and upgrade to Apache commons codec 1.10 (Thomas
|
||||||
|
Neidhart via Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
|
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
|
||||||
|
|
|
@ -0,0 +1,100 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.language.DaitchMokotoffSoundex;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create tokens for phonetic matches based on Daitch–Mokotoff Soundex.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public final class DaitchMokotoffSoundexFilter extends TokenFilter {
|
||||||
|
/** true if encoded tokens should be added as synonyms */
|
||||||
|
protected boolean inject = true;
|
||||||
|
/** phonetic encoder */
|
||||||
|
protected DaitchMokotoffSoundex encoder = new DaitchMokotoffSoundex();
|
||||||
|
|
||||||
|
// output is a string such as ab|ac|...
|
||||||
|
private static final Pattern pattern = Pattern.compile("([^|]+)");
|
||||||
|
|
||||||
|
// matcher over any buffered output
|
||||||
|
private final Matcher matcher = pattern.matcher("");
|
||||||
|
|
||||||
|
// encoded representation
|
||||||
|
private String encoded;
|
||||||
|
// preserves all attributes for any buffered outputs
|
||||||
|
private State state;
|
||||||
|
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a DaitchMokotoffSoundexFilter by either adding encoded forms as synonyms (
|
||||||
|
* <code>inject=true</code>) or replacing them.
|
||||||
|
*/
|
||||||
|
public DaitchMokotoffSoundexFilter(TokenStream in, boolean inject) {
|
||||||
|
super(in);
|
||||||
|
this.inject = inject;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (matcher.find()) {
|
||||||
|
assert state != null && encoded != null;
|
||||||
|
restoreState(state);
|
||||||
|
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
|
||||||
|
posAtt.setPositionIncrement(0);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
// pass through zero-length terms
|
||||||
|
if (termAtt.length() == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
encoded = encoder.soundex(termAtt.toString());
|
||||||
|
state = captureState();
|
||||||
|
matcher.reset(encoded);
|
||||||
|
|
||||||
|
if (!inject) {
|
||||||
|
if (matcher.find()) {
|
||||||
|
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
super.reset();
|
||||||
|
matcher.reset("");
|
||||||
|
state = null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,66 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link DaitchMokotoffSoundexFilter}.
|
||||||
|
*
|
||||||
|
* Create tokens based on Daitch–Mokotoff Soundex phonetic filter.
|
||||||
|
* <p>
|
||||||
|
* This takes one optional argument:
|
||||||
|
* <dl>
|
||||||
|
* <dt>inject</dt><dd> (default=true) add tokens to the stream with the offset=0</dd>
|
||||||
|
* </dl>
|
||||||
|
*
|
||||||
|
* <pre class="prettyprint">
|
||||||
|
* <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
* <analyzer>
|
||||||
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
* <filter class="solr.DaitchMokotoffSoundexFilterFactory" inject="true"/>
|
||||||
|
* </analyzer>
|
||||||
|
* </fieldType></pre>
|
||||||
|
*
|
||||||
|
* @see DaitchMokotoffSoundexFilter
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public class DaitchMokotoffSoundexFilterFactory extends TokenFilterFactory {
|
||||||
|
/** parameter name: true if encoded tokens should be added as synonyms */
|
||||||
|
public static final String INJECT = "inject"; // boolean
|
||||||
|
|
||||||
|
final boolean inject; //accessed by the test
|
||||||
|
|
||||||
|
/** Creates a new PhoneticFilterFactory */
|
||||||
|
public DaitchMokotoffSoundexFilterFactory(Map<String,String> args) {
|
||||||
|
super(args);
|
||||||
|
inject = getBoolean(args, INJECT, true);
|
||||||
|
if (!args.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DaitchMokotoffSoundexFilter create(TokenStream input) {
|
||||||
|
return new DaitchMokotoffSoundexFilter(input, inject);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -17,12 +17,10 @@ package org.apache.lucene.analysis.phonetic;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.StringReader;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
/** Simple tests for {@link BeiderMorseFilterFactory} */
|
/** Simple tests for {@link BeiderMorseFilterFactory} */
|
||||||
|
@ -31,10 +29,10 @@ public class TestBeiderMorseFilterFactory extends BaseTokenStreamTestCase {
|
||||||
BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(new HashMap<String,String>());
|
BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(new HashMap<String,String>());
|
||||||
TokenStream ts = factory.create(whitespaceMockTokenizer("Weinberg"));
|
TokenStream ts = factory.create(whitespaceMockTokenizer("Weinberg"));
|
||||||
assertTokenStreamContents(ts,
|
assertTokenStreamContents(ts,
|
||||||
new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" },
|
new String[] { "vDnbYrk", "vDnbirk", "vanbYrk", "vanbirk", "vinbYrk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" },
|
||||||
new int[] { 0, 0, 0, 0, 0, 0 },
|
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||||
new int[] { 8, 8, 8, 8, 8, 8 },
|
new int[] { 8, 8, 8, 8, 8, 8, 8, 8, 8},
|
||||||
new int[] { 1, 0, 0, 0, 0, 0 });
|
new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLanguageSet() throws Exception {
|
public void testLanguageSet() throws Exception {
|
||||||
|
|
|
@ -0,0 +1,84 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests {@link DaitchMokotoffSoundexFilter}
|
||||||
|
*/
|
||||||
|
public class TestDaitchMokotoffSoundexFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testAlgorithms() throws Exception {
|
||||||
|
assertAlgorithm(true, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "aaa", "000000", "bbb", "700000", "ccc", "400000", "450000", "454000",
|
||||||
|
"540000", "545000", "500000", "easgasg", "045450" });
|
||||||
|
assertAlgorithm(false, "aaa bbb ccc easgasg",
|
||||||
|
new String[] { "000000", "700000", "400000", "450000", "454000", "540000", "545000",
|
||||||
|
"500000", "045450" });
|
||||||
|
}
|
||||||
|
|
||||||
|
static void assertAlgorithm(boolean inject, String input, String[] expected) throws Exception {
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader(input));
|
||||||
|
DaitchMokotoffSoundexFilter filter = new DaitchMokotoffSoundexFilter(tokenizer, inject);
|
||||||
|
assertTokenStreamContents(filter, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomStrings() throws IOException {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
|
||||||
|
|
||||||
|
Analyzer b = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEmptyTerm() throws IOException {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new KeywordTokenizer();
|
||||||
|
return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, random().nextBoolean()));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
checkOneTerm(a, "", "");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,65 @@
|
||||||
|
package org.apache.lucene.analysis.phonetic;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
public class TestDaitchMokotoffSoundexFilterFactory extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testDefaults() throws Exception {
|
||||||
|
DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(new HashMap<String, String>());
|
||||||
|
Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
inputStream.setReader(new StringReader("international"));
|
||||||
|
|
||||||
|
TokenStream filteredStream = factory.create(inputStream);
|
||||||
|
assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass());
|
||||||
|
assertTokenStreamContents(filteredStream, new String[] { "international", "063963" });
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSettingInject() throws Exception {
|
||||||
|
Map<String,String> parameters = new HashMap<>();
|
||||||
|
parameters.put("inject", "false");
|
||||||
|
DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(parameters);
|
||||||
|
|
||||||
|
Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
|
||||||
|
inputStream.setReader(new StringReader("international"));
|
||||||
|
|
||||||
|
TokenStream filteredStream = factory.create(inputStream);
|
||||||
|
assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass());
|
||||||
|
assertTokenStreamContents(filteredStream, new String[] { "063963" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test that bogus arguments result in exception */
|
||||||
|
public void testBogusArguments() throws Exception {
|
||||||
|
try {
|
||||||
|
new DaitchMokotoffSoundexFilterFactory(new HashMap<String,String>() {{
|
||||||
|
put("bogusArg", "bogusValue");
|
||||||
|
}});
|
||||||
|
fail();
|
||||||
|
} catch (IllegalArgumentException expected) {
|
||||||
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -55,7 +55,7 @@ com.sun.jersey.version = 1.9
|
||||||
/com.uwyn/jhighlight = 1.0
|
/com.uwyn/jhighlight = 1.0
|
||||||
/commons-beanutils/commons-beanutils = 1.8.3
|
/commons-beanutils/commons-beanutils = 1.8.3
|
||||||
/commons-cli/commons-cli = 1.2
|
/commons-cli/commons-cli = 1.2
|
||||||
/commons-codec/commons-codec = 1.9
|
/commons-codec/commons-codec = 1.10
|
||||||
/commons-collections/commons-collections = 3.2.1
|
/commons-collections/commons-collections = 3.2.1
|
||||||
/commons-configuration/commons-configuration = 1.6
|
/commons-configuration/commons-configuration = 1.6
|
||||||
/commons-digester/commons-digester = 2.1
|
/commons-digester/commons-digester = 2.1
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
4b95f4897fa13f2cd904aee711aeafc0c5295cd8
|
|
@ -1 +0,0 @@
|
||||||
9ce04e34240f674bc72680f8b843b1457383161a
|
|
|
@ -0,0 +1 @@
|
||||||
|
4b95f4897fa13f2cd904aee711aeafc0c5295cd8
|
|
@ -1 +0,0 @@
|
||||||
9ce04e34240f674bc72680f8b843b1457383161a
|
|
Loading…
Reference in New Issue