SOLR-4084: add factory for FuzzySuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1410908 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-11-18 15:50:01 +00:00
parent a7dfbe02f6
commit cb90f42bee
6 changed files with 394 additions and 1 deletions

View File

@ -89,6 +89,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
* suggestions.
*/
public static final int DEFAULT_MAX_EDITS = 1;
/**
* The default transposition value passed to {@link LevenshteinAutomata}
*/
public static final boolean DEFAULT_TRANSPOSITIONS = true;
/**
* Creates a {@link FuzzySuggester} instance initialized with default values.
@ -108,7 +113,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
* Analyzer that will be used for analyzing query text during lookup
*/
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
}

View File

@ -67,6 +67,9 @@ New Features
that the "root" locale is default. Previously it was the machine's default locale.
(James Dyer)
* SOLR-4084: Add FuzzyLookupFactory, which is like AnalyzingSuggester except that
it can tolerate typos in the input. (Areek Zillur via Robert Muir)
Optimizations
----------------------

View File

@ -0,0 +1,126 @@
package org.apache.solr.spelling.suggest.fst;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.spelling.suggest.LookupFactory;
/**
* Factory for {@link FuzzySuggester}
* @lucene.experimental
*/
public class FuzzyLookupFactory extends LookupFactory {
/**
* Maximum number of edits allowed, used by {@link LevenshteinAutomata#toAutomaton(int)}
*/
public static final String MAX_EDITS = "maxEdits";
/**
* If transpositions are allowed, Fuzzy suggestions will be computed based on a primitive
* edit operation. If it is false, it will be based on the classic Levenshtein algorithm.
*/
public static final String TRANSPOSITIONS = "transpositions";
/**
* Length of common (non-fuzzy) prefix for the suggestions
*/
public static final String NON_FUZZY_PREFIX = "nonFuzzyPrefix";
/**
* Minimum length of lookup key before any edits are allowed for the suggestions
*/
public static final String MIN_FUZZY_LENGTH = "minFuzzyLength";
/**
* File name for the automaton.
*/
private static final String FILENAME = "fwfsta.bin";
@Override
public Lookup create(NamedList params, SolrCore core) {
// mandatory parameter
Object fieldTypeName = params.get(AnalyzingLookupFactory.QUERY_ANALYZER);
if (fieldTypeName == null) {
throw new IllegalArgumentException("Error in configuration: " + AnalyzingLookupFactory.QUERY_ANALYZER + " parameter is mandatory");
}
// retrieve index and query analyzers for the field
FieldType ft = core.getSchema().getFieldTypeByName(fieldTypeName.toString());
Analyzer indexAnalyzer = ft.getAnalyzer();
Analyzer queryAnalyzer = ft.getQueryAnalyzer();
// optional parameters
boolean exactMatchFirst = (params.get(AnalyzingLookupFactory.EXACT_MATCH_FIRST) != null)
? Boolean.valueOf(params.get(AnalyzingLookupFactory.EXACT_MATCH_FIRST).toString())
: true;
boolean preserveSep = (params.get(AnalyzingLookupFactory.PRESERVE_SEP) != null)
? Boolean.valueOf(params.get(AnalyzingLookupFactory.PRESERVE_SEP).toString())
: true;
int options = 0;
if (exactMatchFirst) {
options |= FuzzySuggester.EXACT_FIRST;
}
if (preserveSep) {
options |= FuzzySuggester.PRESERVE_SEP;
}
int maxSurfaceFormsPerAnalyzedForm = (params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS) != null)
? Integer.parseInt(params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS).toString())
: 256;
int maxGraphExpansions = (params.get(AnalyzingLookupFactory.MAX_EXPANSIONS) != null)
? Integer.parseInt(params.get(AnalyzingLookupFactory.MAX_EXPANSIONS).toString())
: -1;
int maxEdits = (params.get(MAX_EDITS) != null)
? Integer.parseInt(params.get(MAX_EDITS).toString())
: FuzzySuggester.DEFAULT_MAX_EDITS;
boolean transpositions = (params.get(TRANSPOSITIONS) != null)
? Boolean.parseBoolean(params.get(TRANSPOSITIONS).toString())
: FuzzySuggester.DEFAULT_TRANSPOSITIONS;
int nonFuzzyPrefix = (params.get(NON_FUZZY_PREFIX) != null)
? Integer.parseInt(params.get(NON_FUZZY_PREFIX).toString())
:FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
int minFuzzyLength = (params.get(MIN_FUZZY_LENGTH) != null)
? Integer.parseInt(params.get(MIN_FUZZY_LENGTH).toString())
:FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
return new FuzzySuggester(indexAnalyzer, queryAnalyzer, options,
maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, maxEdits,
transpositions, nonFuzzyPrefix, minFuzzyLength);
}
@Override
public String storeFileName() {
return FILENAME;
}
}

View File

@ -0,0 +1,4 @@
# simple fuzzy suggest phrase dictionary for testing
change 1.0
charge 1.0
chance 1.0

View File

@ -64,6 +64,93 @@
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- FuzzyLookup suggest component (default)-->
<searchComponent class="solr.SpellCheckComponent" name="fuzzy_suggest_analyzing">
<lst name="spellchecker">
<str name="name">fuzzy_suggest_analyzing</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FuzzyLookupFactory</str>
<str name="storeDir">fuzzy_suggest_analyzing</str>
<str name="buildOnCommit">false</str>
<!-- Suggester properties -->
<bool name="exactMatchFirst">true</bool>
<str name="suggestAnalyzerFieldType">text</str>
<bool name="preserveSep">false</bool>
<str name="sourceLocation">fuzzysuggest.txt</str>
</lst>
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- FuzzyLookup suggest component with max edit 2-->
<searchComponent class="solr.SpellCheckComponent" name="fuzzy_suggest_analyzing_with_max_edit_2">
<lst name="spellchecker">
<str name="name">fuzzy_suggest_analyzing_with_max_edit_2</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FuzzyLookupFactory</str>
<str name="storeDir">fuzzy_suggest_analyzing_with_max_edit_2</str>
<str name="buildOnCommit">false</str>
<!-- Suggester properties -->
<bool name="exactMatchFirst">true</bool>
<str name="suggestAnalyzerFieldType">text</str>
<bool name="preserveSep">false</bool>
<int name="maxEdits">2</int>
<str name="sourceLocation">fuzzysuggest.txt</str>
</lst>
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- FuzzyLookup suggest component with 4 non_fuzzy_prefix -->
<searchComponent class="solr.SpellCheckComponent" name="fuzzy_suggest_analyzing_with_non_fuzzy_prefix_4">
<lst name="spellchecker">
<str name="name">fuzzy_suggest_analyzing_with_non_fuzzy_prefix_4</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FuzzyLookupFactory</str>
<str name="storeDir">fuzzy_suggest_analyzing_with_non_fuzzy_prefix_4</str>
<str name="buildOnCommit">false</str>
<!-- Suggester properties -->
<bool name="exactMatchFirst">true</bool>
<str name="suggestAnalyzerFieldType">text</str>
<bool name="preserveSep">false</bool>
<int name="nonFuzzyPrefix">4</int>
<str name="sourceLocation">fuzzysuggest.txt</str>
</lst>
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- FuzzyLookup suggest component with 2 min_fuzzy_length -->
<searchComponent class="solr.SpellCheckComponent" name="fuzzy_suggest_analyzing_with_min_fuzzy_length_2">
<lst name="spellchecker">
<str name="name">fuzzy_suggest_analyzing_with_min_fuzzy_length_2</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FuzzyLookupFactory</str>
<str name="storeDir">fuzzy_suggest_analyzing_with_min_fuzzy_length_2</str>
<str name="buildOnCommit">false</str>
<!-- Suggester properties -->
<bool name="exactMatchFirst">true</bool>
<str name="suggestAnalyzerFieldType">text</str>
<bool name="preserveSep">false</bool>
<int name="minFuzzyLength">2</int>
<str name="sourceLocation">fuzzysuggest.txt</str>
</lst>
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- is this thing just configured globally or wtf is going on here?! -->
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
@ -95,4 +182,59 @@
</arr>
</requestHandler>
<!-- Fuzzy analyzing handler with 1 max edit -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/fuzzy_suggest_analyzing">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">fuzzy_suggest_analyzing</str>
<str name="spellcheck.collate">false</str>
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
<str name="spellcheck.onlyMorePopular">true</str>
</lst>
<arr name="components">
<str>fuzzy_suggest_analyzing</str>
</arr>
</requestHandler>
<!-- Fuzzy analyzing handler with 2 max edit -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/fuzzy_suggest_analyzing_with_max_edit_2">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">fuzzy_suggest_analyzing_with_max_edit_2</str>
<str name="spellcheck.collate">false</str>
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
<str name="spellcheck.onlyMorePopular">true</str>
</lst>
<arr name="components">
<str>fuzzy_suggest_analyzing_with_max_edit_2</str>
</arr>
</requestHandler>
<!-- Fuzzy analyzing handler with 4 non_fuzzy_prefix -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/fuzzy_suggest_analyzing_with_non_fuzzy_prefix_4">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">fuzzy_suggest_analyzing_with_non_fuzzy_prefix_4</str>
<str name="spellcheck.collate">false</str>
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
<str name="spellcheck.onlyMorePopular">true</str>
</lst>
<arr name="components">
<str>fuzzy_suggest_analyzing_with_non_fuzzy_prefix_4</str>
</arr>
</requestHandler>
<!-- Fuzzy analyzing handler with 2 min_fuzzy_length -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/fuzzy_suggest_analyzing_with_min_fuzzy_length_2">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">fuzzy_suggest_analyzing_with_min_fuzzy_length_2</str>
<str name="spellcheck.collate">false</str>
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
<str name="spellcheck.onlyMorePopular">true</str>
</lst>
<arr name="components">
<str>fuzzy_suggest_analyzing_with_min_fuzzy_length_2</str>
</arr>
</requestHandler>
</config>

View File

@ -0,0 +1,113 @@
package org.apache.solr.spelling.suggest;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.junit.BeforeClass;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestFuzzyAnalyzedSuggestions extends SolrTestCaseJ4 {
static final String URI_DEFAULT = "/fuzzy_suggest_analyzing";
static final String URI_MIN_EDIT_2 = "/fuzzy_suggest_analyzing_with_max_edit_2";
static final String URI_NON_PREFIX_LENGTH_4 = "/fuzzy_suggest_analyzing_with_non_fuzzy_prefix_4";
static final String URI_MIN_FUZZY_LENGTH = "/fuzzy_suggest_analyzing_with_min_fuzzy_length_2";
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
// Suggestions text include : change, charge, chance
assertQ(req("qt", URI_DEFAULT, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
assertQ(req("qt", URI_MIN_EDIT_2, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
assertQ(req("qt", URI_NON_PREFIX_LENGTH_4, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
assertQ(req("qt", URI_MIN_FUZZY_LENGTH, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
}
public void testDefault() throws Exception {
// tests to demonstrate default maxEdit parameter (value: 1), control for testWithMaxEdit2
assertQ(req("qt", URI_DEFAULT, "q", "chagn", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagn']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagn']/arr[@name='suggestion']/str[1][.='chance']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagn']/arr[@name='suggestion']/str[2][.='change']"
);
assertQ(req("qt", URI_DEFAULT, "q", "chacn", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chacn']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chacn']/arr[@name='suggestion']/str[1][.='chance']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chacn']/arr[@name='suggestion']/str[2][.='change']"
);
assertQ(req("qt", URI_DEFAULT, "q", "chagr", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagr']/int[@name='numFound'][.='1']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagr']/arr[@name='suggestion']/str[1][.='charge']"
);
// test to demonstrate default nonFuzzyPrefix parameter (value: 1), control for testWithNonFuzzyPrefix4
assertQ(req("qt", URI_DEFAULT, "q", "chanr", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chanr']/int[@name='numFound'][.='3']"
);
// test to demonstrate default minFuzzyPrefix parameter (value: 3), control for testWithMinFuzzyLength2
assertQ(req("qt", URI_DEFAULT, "q", "cyhnce", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions'][not(node())]"
);
}
public void testWithMaxEdit2() throws Exception {
assertQ(req("qt", URI_MIN_EDIT_2, "q", "chagn", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagn']/int[@name='numFound'][.='3']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagn']/arr[@name='suggestion']/str[1][.='chance']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagn']/arr[@name='suggestion']/str[2][.='change']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagn']/arr[@name='suggestion']/str[3][.='charge']"
);
assertQ(req("qt", URI_MIN_EDIT_2, "q", "chagr", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagr']/int[@name='numFound'][.='3']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagr']/arr[@name='suggestion']/str[1][.='chance']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagr']/arr[@name='suggestion']/str[2][.='change']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chagr']/arr[@name='suggestion']/str[3][.='charge']"
);
assertQ(req("qt", URI_MIN_EDIT_2, "q", "chacn", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chacn']/int[@name='numFound'][.='3']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chacn']/arr[@name='suggestion']/str[1][.='chance']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chacn']/arr[@name='suggestion']/str[2][.='change']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chacn']/arr[@name='suggestion']/str[3][.='charge']"
);
}
public void testWithNonFuzzyPrefix4() throws Exception {
// This test should not match charge, as the nonFuzzyPrefix has been set to 4
assertQ(req("qt", URI_NON_PREFIX_LENGTH_4, "q", "chanr", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chanr']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chanr']/arr[@name='suggestion']/str[1][.='chance']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chanr']/arr[@name='suggestion']/str[2][.='change']"
);
}
public void testWithMinFuzzyLength2() throws Exception {
// This test should match chance as the minFuzzyLength parameter has been set to 2
assertQ(req("qt", URI_MIN_FUZZY_LENGTH, "q", "chynce", SpellingParams.SPELLCHECK_COUNT, "3"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chynce']/int[@name='numFound'][.='1']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='chynce']/arr[@name='suggestion']/str[1][.='chance']"
);
}
}