mirror of https://github.com/apache/lucene.git
SOLR-2982: add Beider-Morse phonetic filter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1225211 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
21822811a9
commit
f3869ef3ce
|
@ -87,7 +87,7 @@
|
|||
<classpathentry kind="lib" path="lucene/lib/junit-4.7.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/contrib/sandbox/lib/jakarta-regexp-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/icu/lib/icu4j-4_8_1_1.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.5.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/phonetic/lib/commons-codec-1.6.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-fsa-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-polish-1.5.2.jar"/>
|
||||
<classpathentry kind="lib" path="modules/analysis/morfologik/lib/morfologik-stemming-1.5.2.jar"/>
|
||||
|
@ -98,7 +98,6 @@
|
|||
<classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
|
||||
<classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1209632.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-codec-1.5.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-csv-1.0-SNAPSHOT-r966014.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
|
||||
<classpathentry kind="lib" path="solr/lib/commons-httpclient-3.1.jar"/>
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[ee1bc49acae11cc79eceec51f7be785590e99fd8] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,5 +1,5 @@
|
|||
Apache Commons Codec
|
||||
Copyright 2002-2009 The Apache Software Foundation
|
||||
Copyright 2002-2011 The Apache Software Foundation
|
||||
|
||||
This product includes software developed by
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
package org.apache.lucene.analysis.phonetic;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.codec.language.bm.BeiderMorseEncoder; // javadocs
|
||||
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
|
||||
import org.apache.commons.codec.language.bm.PhoneticEngine;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
* TokenFilter for Beider-Morse phonetic encoding.
|
||||
* @see BeiderMorseEncoder
|
||||
*/
|
||||
public final class BeiderMorseFilter extends TokenFilter {
|
||||
private final PhoneticEngine engine;
|
||||
private final LanguageSet languages;
|
||||
|
||||
// output is a string such as ab|ac|...
|
||||
// in complex cases like d'angelo its (anZelo|andZelo|...)-(danZelo|...)
|
||||
// if there are multiple 's, it starts to nest...
|
||||
private static final Pattern pattern = Pattern.compile("([^()|-]+)");
|
||||
|
||||
// matcher over any buffered output
|
||||
private final Matcher matcher = pattern.matcher("");
|
||||
// encoded representation
|
||||
private String encoded;
|
||||
// offsets for any buffered outputs
|
||||
private int startOffset;
|
||||
private int endOffset;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
/**
|
||||
* Calls {@link #BeiderMorseFilter(TokenStream, PhoneticEngine, LanguageSet)
|
||||
* BeiderMorseFilter(input, engine, null)}
|
||||
*/
|
||||
public BeiderMorseFilter(TokenStream input, PhoneticEngine engine) {
|
||||
this(input, engine, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new BeiderMorseFilter
|
||||
* @param input TokenStream to filter
|
||||
* @param engine configured PhoneticEngine with BM settings.
|
||||
* @param languages optional Set of original languages. Can be null (which means it will be guessed).
|
||||
*/
|
||||
public BeiderMorseFilter(TokenStream input, PhoneticEngine engine, LanguageSet languages) {
|
||||
super(input);
|
||||
this.engine = engine;
|
||||
this.languages = languages;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (matcher.find()) {
|
||||
clearAttributes();
|
||||
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
offsetAtt.setOffset(startOffset, endOffset);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (input.incrementToken()) {
|
||||
encoded = (languages == null)
|
||||
? engine.encode(termAtt.toString())
|
||||
: engine.encode(termAtt.toString(), languages);
|
||||
startOffset = offsetAtt.startOffset();
|
||||
endOffset = offsetAtt.endOffset();
|
||||
matcher.reset(encoded);
|
||||
if (matcher.find()) {
|
||||
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
matcher.reset("");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
package org.apache.lucene.analysis.phonetic;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.commons.codec.language.bm.NameType;
|
||||
import org.apache.commons.codec.language.bm.PhoneticEngine;
|
||||
import org.apache.commons.codec.language.bm.RuleType;
|
||||
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/** Tests {@link BeiderMorseFilter} */
|
||||
public class TestBeiderMorseFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer,
|
||||
new BeiderMorseFilter(tokenizer, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true)));
|
||||
}
|
||||
};
|
||||
|
||||
/** generic, "exact" configuration */
|
||||
public void testBasicUsage() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "Angelo",
|
||||
new String[] { "anZelo", "andZelo", "angelo", "anhelo", "anjelo", "anxelo" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0 },
|
||||
new int[] { 6, 6, 6, 6, 6, 6 },
|
||||
new int[] { 1, 0, 0, 0, 0, 0 });
|
||||
|
||||
assertAnalyzesTo(analyzer, "D'Angelo",
|
||||
new String[] { "anZelo", "andZelo", "angelo", "anhelo", "anjelo", "anxelo",
|
||||
"danZelo", "dandZelo", "dangelo", "danhelo", "danjelo", "danxelo" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||
new int[] { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 },
|
||||
new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
|
||||
}
|
||||
|
||||
/** restrict the output to a set of possible origin languages */
|
||||
public void testLanguageSet() throws Exception {
|
||||
final LanguageSet languages = LanguageSet.from(new HashSet<String>() {{
|
||||
add("italian"); add("greek"); add("spanish");
|
||||
}});
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer,
|
||||
new BeiderMorseFilter(tokenizer,
|
||||
new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true), languages));
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(analyzer, "Angelo",
|
||||
new String[] { "andZelo", "angelo", "anxelo" },
|
||||
new int[] { 0, 0, 0, },
|
||||
new int[] { 6, 6, 6, },
|
||||
new int[] { 1, 0, 0, });
|
||||
}
|
||||
|
||||
/** for convenience, if the input yields no output, we pass it thru as-is */
|
||||
public void testNumbers() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "1234",
|
||||
new String[] { "1234" },
|
||||
new int[] { 0 },
|
||||
new int[] { 4 },
|
||||
new int[] { 1 });
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
checkRandomData(random, analyzer, 1000 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
|
||||
import org.apache.commons.codec.language.bm.NameType;
|
||||
import org.apache.commons.codec.language.bm.PhoneticEngine;
|
||||
import org.apache.commons.codec.language.bm.RuleType;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
|
||||
|
||||
/**
|
||||
* Factory for {@link BeiderMorseFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_bm" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
* <filter class="solr.BeiderMorseFilterFactory"
|
||||
* nameType="GENERIC" ruleType="APPROX"
|
||||
* concat="true" languageSet="auto"
|
||||
* </filter>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
*/
|
||||
public class BeiderMorseFilterFactory extends BaseTokenFilterFactory {
|
||||
private PhoneticEngine engine;
|
||||
private LanguageSet languageSet;
|
||||
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
|
||||
// PhoneticEngine = NameType + RuleType + concat
|
||||
// we use common-codec's defaults: GENERIC + APPROX + true
|
||||
String nameTypeArg = args.get("nameType");
|
||||
NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
|
||||
|
||||
String ruleTypeArg = args.get("ruleType");
|
||||
RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
|
||||
|
||||
boolean concat = getBoolean("concat", true);
|
||||
engine = new PhoneticEngine(nameType, ruleType, concat);
|
||||
|
||||
// LanguageSet: defaults to automagic, otherwise a comma-separated list.
|
||||
String languageSetArg = args.get("languageSet");
|
||||
if (languageSetArg == null || languageSetArg.equals("auto")) {
|
||||
languageSet = null;
|
||||
} else {
|
||||
languageSet = LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new BeiderMorseFilter(input, engine, languageSet);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/** Simple tests for {@link BeiderMorseFilterFactory} */
|
||||
public class TestBeiderMorseFilterFactory extends BaseTokenTestCase {
|
||||
public void testBasics() throws Exception {
|
||||
BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0 },
|
||||
new int[] { 8, 8, 8, 8, 8, 8 },
|
||||
new int[] { 1, 0, 0, 0, 0, 0 });
|
||||
}
|
||||
|
||||
public void testLanguageSet() throws Exception {
|
||||
BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("languageSet", "polish");
|
||||
factory.init(args);
|
||||
TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "vDmbYrk", "vDmbirk", "vambYrk", "vambirk", "vimbYrk", "vimbirk" },
|
||||
new int[] { 0, 0, 0, 0, 0, 0 },
|
||||
new int[] { 8, 8, 8, 8, 8, 8 },
|
||||
new int[] { 1, 0, 0, 0, 0, 0 });
|
||||
}
|
||||
|
||||
public void testOptions() throws Exception {
|
||||
BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("nameType", "ASHKENAZI");
|
||||
args.put("ruleType", "EXACT");
|
||||
factory.init(args);
|
||||
TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "vajnberk" },
|
||||
new int[] { 0 },
|
||||
new int[] { 8 },
|
||||
new int[] { 1 });
|
||||
}
|
||||
}
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[e9013fed78f333c928ff7f828948b91fcb5a92b4] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[ee1bc49acae11cc79eceec51f7be785590e99fd8] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,5 +1,5 @@
|
|||
Apache Commons Codec
|
||||
Copyright 2002-2009 The Apache Software Foundation
|
||||
Copyright 2002-2011 The Apache Software Foundation
|
||||
|
||||
This product includes software developed by
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
||||
|
|
Loading…
Reference in New Issue