SOLR-5695: Add support for Lucene's BlendedInfixSuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1566222 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Areek Zillur 2014-02-09 05:18:05 +00:00
parent fad3886db8
commit 450e6204f4
7 changed files with 282 additions and 3 deletions

View File

@ -143,6 +143,9 @@ New Features
* SOLR-5631: Add support for Lucene's FreeTextSuggester.
(Areek Zillur via Robert Muir)
* SOLR-5695: Add support for Lucene's BlendedInfixSuggester.
(Areek Zillur)
* SOLR-1301: Add a Solr contrib that allows for building Solr indexes via
Hadoop's MapReduce. (Matt Revelle, Alexander Kanarsky, Steve Rowe,
Mark Miller, Greg Bowyer, Jason Rutherglen, Kris Jirapinyo, Jason Venner ,

View File

@ -37,22 +37,25 @@ public class AnalyzingInfixLookupFactory extends LookupFactory {
/**
* The analyzer used at "query-time" and "build-time" to analyze suggestions.
*/
public static final String QUERY_ANALYZER = "suggestAnalyzerFieldType";
protected static final String QUERY_ANALYZER = "suggestAnalyzerFieldType";
/**
* The path where the underlying index is stored
* if no index is found, it will be generated by
* the AnalyzingInfixSuggester
*/
public static final String INDEX_PATH = "indexPath";
protected static final String INDEX_PATH = "indexPath";
/**
* Minimum number of leading characters before PrefixQuery is used (default 4).
* Prefixes shorter than this are indexed as character ngrams
* (increasing index size but making lookups faster)
*/
private static final String MIN_PREFIX_CHARS = "minPrefixChars";
protected static final String MIN_PREFIX_CHARS = "minPrefixChars";
/**
* Default path where the index for the suggester is stored/loaded from
* */
private static final String DEFAULT_INDEX_PATH = "analyzingInfixSuggesterIndexDir";
/**

View File

@ -0,0 +1,118 @@
package org.apache.solr.spelling.suggest.fst;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.search.suggest.analyzing.BlendedInfixSuggester;
import org.apache.lucene.search.suggest.analyzing.BlendedInfixSuggester.BlenderType;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
/**
 * Factory for {@link BlendedInfixSuggester}
 * @lucene.experimental
 */
public class BlendedInfixLookupFactory extends AnalyzingInfixLookupFactory {

  /**
   * Blender type used to calculate weight coefficient using the position
   * of the first matching word
   * <p>
   * Available blender types are: <br>
   * linear: weight*(1 - 0.10*position) [default]<br>
   * reciprocal: weight/(1+position)
   */
  private static final String BLENDER_TYPE = "blenderType";

  /**
   * Factor to multiply the number of searched elements
   * Default is 10
   */
  private static final String NUM_FACTOR = "numFactor";

  /**
   * Default path where the index for the suggester is stored/loaded from
   */
  private static final String DEFAULT_INDEX_PATH = "blendedInfixSuggesterIndexDir";

  /**
   * File name for the automaton.
   */
  private static final String FILENAME = "bifsta.bin";

  /**
   * Builds a {@link BlendedInfixSuggester} from the suggester configuration.
   *
   * @param params suggester configuration; {@link #QUERY_ANALYZER} is mandatory,
   *               {@link #INDEX_PATH}, {@link #MIN_PREFIX_CHARS}, {@link #BLENDER_TYPE}
   *               and {@link #NUM_FACTOR} are optional
   * @param core   core whose latest schema supplies the analyzers and whose config
   *               supplies the Lucene match version
   * @return the configured suggester
   * @throws IllegalArgumentException if {@link #QUERY_ANALYZER} is missing or names a
   *         field type that is not defined in the schema
   * @throws NumberFormatException if {@link #MIN_PREFIX_CHARS} or {@link #NUM_FACTOR}
   *         is present but is not a parsable integer
   * @throws RuntimeException wrapping the {@link IOException} raised while creating
   *         the suggester
   */
  @Override
  public Lookup create(NamedList params, SolrCore core) {
    // mandatory parameter
    Object fieldTypeName = params.get(QUERY_ANALYZER);
    if (fieldTypeName == null) {
      throw new IllegalArgumentException("Error in configuration: " + QUERY_ANALYZER + " parameter is mandatory");
    }
    FieldType ft = core.getLatestSchema().getFieldTypeByName(fieldTypeName.toString());
    if (ft == null) {
      throw new IllegalArgumentException("Error in configuration: " + fieldTypeName.toString() + " is not defined in the schema");
    }
    Analyzer indexAnalyzer = ft.getAnalyzer();
    Analyzer queryAnalyzer = ft.getQueryAnalyzer();

    // optional parameters; each one is looked up once and falls back to its default when absent
    Object indexPathParam = params.get(INDEX_PATH);
    String indexPath = indexPathParam != null
        ? indexPathParam.toString()
        : DEFAULT_INDEX_PATH;

    Object minPrefixCharsParam = params.get(MIN_PREFIX_CHARS);
    int minPrefixChars = minPrefixCharsParam != null
        ? Integer.parseInt(minPrefixCharsParam.toString())
        : AnalyzingInfixSuggester.DEFAULT_MIN_PREFIX_CHARS;

    BlenderType blenderType = getBlenderType(params.get(BLENDER_TYPE));

    Object numFactorParam = params.get(NUM_FACTOR);
    int numFactor = numFactorParam != null
        ? Integer.parseInt(numFactorParam.toString())
        : BlendedInfixSuggester.DEFAULT_NUM_FACTOR;

    try {
      return new BlendedInfixSuggester(core.getSolrConfig().luceneMatchVersion,
          new File(indexPath), indexAnalyzer, queryAnalyzer, minPrefixChars, blenderType, numFactor);
    } catch (IOException e) {
      // keep the original failure as the cause so the underlying I/O problem stays diagnosable
      throw new RuntimeException(e);
    }
  }

  /**
   * @return the file name under which the suggester data is stored
   */
  @Override
  public String storeFileName() {
    return FILENAME;
  }

  /**
   * Maps the configured blender type onto a {@link BlenderType}.
   *
   * @param blenderTypeParam raw configuration value, may be {@code null}
   * @return {@link BlenderType#POSITION_RECIPROCAL} when the value equals
   *         "reciprocal" (ignoring case); {@link BlenderType#POSITION_LINEAR}
   *         for {@code null} and for every other value
   */
  private BlenderType getBlenderType(Object blenderTypeParam) {
    BlenderType blenderType = BlenderType.POSITION_LINEAR;
    if (blenderTypeParam != null) {
      String blenderTypeStr = blenderTypeParam.toString();
      if (blenderTypeStr.equalsIgnoreCase("reciprocal")) {
        blenderType = BlenderType.POSITION_RECIPROCAL;
      }
    }
    return blenderType;
  }
}

View File

@ -0,0 +1,3 @@
top of the lake 18 lake
star wars: episode v - the empire strikes back 12 star
the returned 10 ret

View File

@ -118,6 +118,34 @@
</lst>
</searchComponent>
<!-- BlendedInfixLookup suggest component with FileDictionaryFactory.
     Declares two suggesters over the same source file (blendedInfixSuggest.txt):
     one using the "linear" blender type and one using the "reciprocal" blender type. -->
<searchComponent class="solr.SuggestComponent" name="blended_infix_suggest">
<lst name="suggester">
<str name="name">blended_infix_suggest_linear</str>
<str name="lookupImpl">BlendedInfixLookupFactory</str>
<str name="dictionaryImpl">FileDictionaryFactory</str>
<str name="buildOnCommit">false</str>
<str name="sourceLocation">blendedInfixSuggest.txt</str>
<!-- Suggester properties -->
<str name="blenderType">linear</str>
<str name="suggestAnalyzerFieldType">text</str>
</lst>
<lst name="suggester">
<str name="name">blended_infix_suggest_reciprocal</str>
<str name="lookupImpl">BlendedInfixLookupFactory</str>
<str name="dictionaryImpl">FileDictionaryFactory</str>
<str name="buildOnCommit">false</str>
<str name="sourceLocation">blendedInfixSuggest.txt</str>
<!-- Suggester properties -->
<str name="blenderType">reciprocal</str>
<str name="suggestAnalyzerFieldType">text</str>
</lst>
</searchComponent>
<!-- FuzzyLookup suggest component with FileDictionaryFactory -->
<searchComponent class="solr.SuggestComponent" name="fuzzy_suggest_analyzing_with_file_dict">
<lst name="suggester">
@ -277,6 +305,15 @@
</requestHandler>
<!-- Request handler at /blended_infix_suggest: runs the blended_infix_suggest
     component with suggest=true as a default parameter. -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/blended_infix_suggest">
<lst name="defaults">
<str name="suggest">true</str>
</lst>
<arr name="components">
<str>blended_infix_suggest</str>
</arr>
</requestHandler>
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/free_text_suggest">
<lst name="defaults">
<str name="suggest">true</str>

View File

@ -1,7 +1,10 @@
package org.apache.solr.spelling.suggest;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/*
@ -30,6 +33,16 @@ public class TestAnalyzeInfixSuggestions extends SolrTestCaseJ4 {
assertQ(req("qt", URI_DEFAULT, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
}
/**
 * Removes the index directory (and its ".tmp" sibling) that the suggester
 * left in the working directory, failing the run if a deletion does not succeed.
 */
@AfterClass
public static void afterClass() throws Exception {
  String[] leftoverDirs = {
    "analyzingInfixSuggesterIndexDir",
    "analyzingInfixSuggesterIndexDir.tmp"
  };
  for (String dirName : leftoverDirs) {
    File dir = new File(dirName);
    if (dir.exists()) {
      assertTrue(recurseDelete(dir));
    }
  }
}
public void testSingle() throws Exception {
assertQ(req("qt", URI_DEFAULT, "q", "japan", SpellingParams.SPELLCHECK_COUNT, "1"),
@ -63,4 +76,5 @@ public class TestAnalyzeInfixSuggestions extends SolrTestCaseJ4 {
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='japan']/arr[@name='suggestion']/str[3][.='Add decompose compound <b>Japan</b>ese Katakana token capability to Kuromoji']"
);
}
}

View File

@ -0,0 +1,101 @@
package org.apache.solr.spelling.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
 * Exercises the two suggesters registered under the /blended_infix_suggest
 * handler (linear and reciprocal blender types), individually and together.
 */
public class TestBlendedInfixSuggestions extends SolrTestCaseJ4 {
  static final String URI = "/blended_infix_suggest";

  private static final String LINEAR = "blended_infix_suggest_linear";
  private static final String RECIPROCAL = "blended_infix_suggest_reciprocal";

  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
    // build every suggester once before any lookup is issued
    assertQ(req("qt", URI, "q", "", SuggesterParams.SUGGEST_BUILD_ALL, "true"));
  }

  @AfterClass
  public static void afterClass() throws Exception {
    String[] leftoverDirs = {
      "blendedInfixSuggesterIndexDir",
      "blendedInfixSuggesterIndexDir.tmp"
    };
    for (String dirName : leftoverDirs) {
      File dir = new File(dirName);
      if (dir.exists()) {
        assertTrue(recurseDelete(dir));
      }
    }
  }

  public void testLinearBlenderType() {
    assertQ(req("qt", URI, "q", "the", SuggesterParams.SUGGEST_COUNT, "10", SuggesterParams.SUGGEST_DICT, LINEAR),
        linearExpectations());
  }

  public void testReciprocalBlenderType() {
    assertQ(req("qt", URI, "q", "the", SuggesterParams.SUGGEST_COUNT, "10", SuggesterParams.SUGGEST_DICT, RECIPROCAL),
        reciprocalExpectations());
  }

  public void testMultiSuggester() {
    assertQ(req("qt", URI, "q", "the", SuggesterParams.SUGGEST_COUNT, "10", SuggesterParams.SUGGEST_DICT, LINEAR, SuggesterParams.SUGGEST_DICT, RECIPROCAL),
        flatten(linearExpectations(), reciprocalExpectations()));
  }

  /** XPath checks for the "the" lookup against the linear suggester. */
  private static String[] linearExpectations() {
    return flatten(
        new String[] { numFound(LINEAR, 3) },
        suggestion(LINEAR, 1, "top of <b>the</b> lake", 14, "lake"),
        suggestion(LINEAR, 2, "<b>the</b> returned", 10, "ret"),
        suggestion(LINEAR, 3, "star wars: episode v - <b>the</b> empire strikes back", 7, "star"));
  }

  /** XPath checks for the "the" lookup against the reciprocal suggester. */
  private static String[] reciprocalExpectations() {
    return flatten(
        new String[] { numFound(RECIPROCAL, 3) },
        suggestion(RECIPROCAL, 1, "<b>the</b> returned", 10, "ret"),
        suggestion(RECIPROCAL, 2, "top of <b>the</b> lake", 6, "lake"),
        suggestion(RECIPROCAL, 3, "star wars: episode v - <b>the</b> empire strikes back", 2, "star"));
  }

  /** XPath of the result list that the given suggester returned for the query "the". */
  private static String resultPath(String dictionary) {
    return "//lst[@name='suggest']/lst[@name='" + dictionary + "']/lst[@name='the']";
  }

  /** XPath asserting the number of suggestions found by the given suggester. */
  private static String numFound(String dictionary, int expected) {
    return resultPath(dictionary) + "/int[@name='numFound'][.='" + expected + "']";
  }

  /** XPaths asserting term, weight and payload (in that order) of the suggestion at a 1-based rank. */
  private static String[] suggestion(String dictionary, int rank, String term, int weight, String payload) {
    String entry = resultPath(dictionary) + "/arr[@name='suggestions']/lst[" + rank + "]";
    return new String[] {
      entry + "/str[@name='term'][.='" + term + "']",
      entry + "/long[@name='weight'][.='" + weight + "']",
      entry + "/str[@name='payload'][.='" + payload + "']"
    };
  }

  /** Concatenates the given arrays, preserving order. */
  private static String[] flatten(String[]... groups) {
    int total = 0;
    for (String[] group : groups) {
      total += group.length;
    }
    String[] all = new String[total];
    int offset = 0;
    for (String[] group : groups) {
      System.arraycopy(group, 0, all, offset, group.length);
      offset += group.length;
    }
    return all;
  }
}