diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 21160aec499..89bf8b3229b 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -143,6 +143,9 @@ New Features * SOLR-5631: Add support for Lucene's FreeTextSuggester. (Areek Zillur via Robert Muir) +* SOLR-5695: Add support for Lucene's BlendedInfixSuggester. + (Areek Zillur) + * SOLR-1301: Add a Solr contrib that allows for building Solr indexes via Hadoop's MapReduce. (Matt Revelle, Alexander Kanarsky, Steve Rowe, Mark Miller, Greg Bowyer, Jason Rutherglen, Kris Jirapinyo, Jason Venner , diff --git a/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingInfixLookupFactory.java b/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingInfixLookupFactory.java index e8196413cc6..f09c089d743 100644 --- a/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingInfixLookupFactory.java +++ b/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingInfixLookupFactory.java @@ -37,22 +37,25 @@ public class AnalyzingInfixLookupFactory extends LookupFactory { /** * The analyzer used at "query-time" and "build-time" to analyze suggestions. */ - public static final String QUERY_ANALYZER = "suggestAnalyzerFieldType"; + protected static final String QUERY_ANALYZER = "suggestAnalyzerFieldType"; /** * The path where the underlying index is stored * if no index is found, it will be generated by * the AnalyzingInfixSuggester */ - public static final String INDEX_PATH = "indexPath"; + protected static final String INDEX_PATH = "indexPath"; /** * Minimum number of leading characters before PrefixQuery is used (default 4). 
* Prefixes shorter than this are indexed as character ngrams * (increasing index size but making lookups faster) */ - private static final String MIN_PREFIX_CHARS = "minPrefixChars"; + protected static final String MIN_PREFIX_CHARS = "minPrefixChars"; + /** + * Default path where the index for the suggester is stored/loaded from + * */ private static final String DEFAULT_INDEX_PATH = "analyzingInfixSuggesterIndexDir"; /** diff --git a/solr/core/src/java/org/apache/solr/spelling/suggest/fst/BlendedInfixLookupFactory.java b/solr/core/src/java/org/apache/solr/spelling/suggest/fst/BlendedInfixLookupFactory.java new file mode 100644 index 00000000000..1662913c694 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/spelling/suggest/fst/BlendedInfixLookupFactory.java @@ -0,0 +1,118 @@ +package org.apache.solr.spelling.suggest.fst; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.File; +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester; +import org.apache.lucene.search.suggest.analyzing.BlendedInfixSuggester; +import org.apache.lucene.search.suggest.analyzing.BlendedInfixSuggester.BlenderType; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.schema.FieldType; + +/** + * Factory for {@link BlendedInfixSuggester} + * @lucene.experimental + */ +public class BlendedInfixLookupFactory extends AnalyzingInfixLookupFactory { + + /** + * Blender type used to calculate weight coefficient using the position + * of the first matching word + *

+ * Available blender types are:
+ * linear: weight*(1 - 0.10*position) [default]
+ * reciprocal: weight/(1+position) + */ + private static final String BLENDER_TYPE = "blenderType"; + + /** + * Factor to multiply the number of searched elements + * Default is 10 + */ + private static final String NUM_FACTOR = "numFactor"; + + /** + * Default path where the index for the suggester is stored/loaded from + * */ + private static final String DEFAULT_INDEX_PATH = "blendedInfixSuggesterIndexDir"; + + /** + * File name for the automaton. + */ + private static final String FILENAME = "bifsta.bin"; + + + @Override + public Lookup create(NamedList params, SolrCore core) { + // mandatory parameter + Object fieldTypeName = params.get(QUERY_ANALYZER); + if (fieldTypeName == null) { + throw new IllegalArgumentException("Error in configuration: " + QUERY_ANALYZER + " parameter is mandatory"); + } + FieldType ft = core.getLatestSchema().getFieldTypeByName(fieldTypeName.toString()); + if (ft == null) { + throw new IllegalArgumentException("Error in configuration: " + fieldTypeName.toString() + " is not defined in the schema"); + } + Analyzer indexAnalyzer = ft.getAnalyzer(); + Analyzer queryAnalyzer = ft.getQueryAnalyzer(); + + // optional parameters + + String indexPath = params.get(INDEX_PATH) != null + ? params.get(INDEX_PATH).toString() + : DEFAULT_INDEX_PATH; + + int minPrefixChars = params.get(MIN_PREFIX_CHARS) != null + ? Integer.parseInt(params.get(MIN_PREFIX_CHARS).toString()) + : AnalyzingInfixSuggester.DEFAULT_MIN_PREFIX_CHARS; + + BlenderType blenderType = getBlenderType(params.get(BLENDER_TYPE)); + + int numFactor = params.get(NUM_FACTOR) != null + ? 
Integer.parseInt(params.get(NUM_FACTOR).toString()) + : BlendedInfixSuggester.DEFAULT_NUM_FACTOR; + + try { + return new BlendedInfixSuggester(core.getSolrConfig().luceneMatchVersion, + new File(indexPath), indexAnalyzer, queryAnalyzer, minPrefixChars, blenderType, numFactor); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public String storeFileName() { + return FILENAME; + } + + private BlenderType getBlenderType(Object blenderTypeParam) { + BlenderType blenderType = BlenderType.POSITION_LINEAR; + if (blenderTypeParam != null) { + String blenderTypeStr = blenderTypeParam.toString(); + if (blenderTypeStr.equalsIgnoreCase("reciprocal")) { + blenderType = BlenderType.POSITION_RECIPROCAL; + } + } + return blenderType; + } +} diff --git a/solr/core/src/test-files/solr/collection1/conf/blendedInfixSuggest.txt b/solr/core/src/test-files/solr/collection1/conf/blendedInfixSuggest.txt new file mode 100644 index 00000000000..c3b3d340bb7 --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/blendedInfixSuggest.txt @@ -0,0 +1,3 @@ +top of the lake 18 lake +star wars: episode v - the empire strikes back 12 star +the returned 10 ret diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml index 60033d9f755..74a27596e22 100644 --- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml @@ -118,6 +118,34 @@ + + + + blended_infix_suggest_linear + BlendedInfixLookupFactory + FileDictionaryFactory + false + blendedInfixSuggest.txt + + + linear + text + + + + blended_infix_suggest_reciprocal + BlendedInfixLookupFactory + FileDictionaryFactory + false + blendedInfixSuggest.txt + + + reciprocal + text + + + + @@ -277,6 +305,15 @@ + + + true + + + blended_infix_suggest + + + true diff --git
a/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzeInfixSuggestions.java b/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzeInfixSuggestions.java index 0ee3e583356..0e076ef3268 100644 --- a/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzeInfixSuggestions.java +++ b/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzeInfixSuggestions.java @@ -1,7 +1,10 @@ package org.apache.solr.spelling.suggest; +import java.io.File; + import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.SpellingParams; +import org.junit.AfterClass; import org.junit.BeforeClass; /* @@ -30,6 +33,16 @@ public class TestAnalyzeInfixSuggestions extends SolrTestCaseJ4 { assertQ(req("qt", URI_DEFAULT, "q", "", SpellingParams.SPELLCHECK_BUILD, "true")); } + @AfterClass + public static void afterClass() throws Exception { + File indexPathDir = new File("analyzingInfixSuggesterIndexDir"); + File indexPathDirTmp = new File("analyzingInfixSuggesterIndexDir.tmp"); + if (indexPathDir.exists()) + assertTrue(recurseDelete(indexPathDir)); + if (indexPathDirTmp.exists()) + assertTrue(recurseDelete(indexPathDirTmp)); + } + public void testSingle() throws Exception { assertQ(req("qt", URI_DEFAULT, "q", "japan", SpellingParams.SPELLCHECK_COUNT, "1"), @@ -63,4 +76,5 @@ public class TestAnalyzeInfixSuggestions extends SolrTestCaseJ4 { "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='japan']/arr[@name='suggestion']/str[3][.='Add decompose compound Japanese Katakana token capability to Kuromoji']" ); } + } \ No newline at end of file diff --git a/solr/core/src/test/org/apache/solr/spelling/suggest/TestBlendedInfixSuggestions.java b/solr/core/src/test/org/apache/solr/spelling/suggest/TestBlendedInfixSuggestions.java new file mode 100644 index 00000000000..ee2c93cccc0 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/spelling/suggest/TestBlendedInfixSuggestions.java @@ -0,0 +1,101 @@ +package org.apache.solr.spelling.suggest; + 
+/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; + +import org.apache.solr.SolrTestCaseJ4; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +public class TestBlendedInfixSuggestions extends SolrTestCaseJ4 { + static final String URI = "/blended_infix_suggest"; + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml"); + assertQ(req("qt", URI, "q", "", SuggesterParams.SUGGEST_BUILD_ALL, "true")); + } + + @AfterClass + public static void afterClass() throws Exception { + File indexPathDir = new File("blendedInfixSuggesterIndexDir"); + File indexPathDirTmp = new File("blendedInfixSuggesterIndexDir.tmp"); + if (indexPathDir.exists()) + assertTrue(recurseDelete(indexPathDir)); + if (indexPathDirTmp.exists()) + assertTrue(recurseDelete(indexPathDirTmp)); + } + + public void testLinearBlenderType() { + assertQ(req("qt", URI, "q", "the", SuggesterParams.SUGGEST_COUNT, "10", SuggesterParams.SUGGEST_DICT, "blended_infix_suggest_linear"), + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/int[@name='numFound'][.='3']", + 
"//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='term'][.='top of the lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[1]/long[@name='weight'][.='14']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='payload'][.='lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='term'][.='the returned']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[2]/long[@name='weight'][.='10']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='payload'][.='ret']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='term'][.='star wars: episode v - the empire strikes back']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[3]/long[@name='weight'][.='7']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='payload'][.='star']" + ); + + } + + public void testReciprocalBlenderType() { + assertQ(req("qt", URI, "q", "the", SuggesterParams.SUGGEST_COUNT, "10", SuggesterParams.SUGGEST_DICT, "blended_infix_suggest_reciprocal"), + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/int[@name='numFound'][.='3']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='term'][.='the returned']", + 
"//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[1]/long[@name='weight'][.='10']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='payload'][.='ret']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='term'][.='top of the lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[2]/long[@name='weight'][.='6']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='payload'][.='lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='term'][.='star wars: episode v - the empire strikes back']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[3]/long[@name='weight'][.='2']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='payload'][.='star']" + ); + } + + public void testMultiSuggester() { + assertQ(req("qt", URI, "q", "the", SuggesterParams.SUGGEST_COUNT, "10", SuggesterParams.SUGGEST_DICT, "blended_infix_suggest_linear", SuggesterParams.SUGGEST_DICT, "blended_infix_suggest_reciprocal"), + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/int[@name='numFound'][.='3']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='term'][.='top of the lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[1]/long[@name='weight'][.='14']", + 
"//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='payload'][.='lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='term'][.='the returned']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[2]/long[@name='weight'][.='10']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='payload'][.='ret']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='term'][.='star wars: episode v - the empire strikes back']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[3]/long[@name='weight'][.='7']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_linear']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='payload'][.='star']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/int[@name='numFound'][.='3']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='term'][.='the returned']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[1]/long[@name='weight'][.='10']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[1]/str[@name='payload'][.='ret']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='term'][.='top of the lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[2]/long[@name='weight'][.='6']", + 
"//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[2]/str[@name='payload'][.='lake']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='term'][.='star wars: episode v - the empire strikes back']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[3]/long[@name='weight'][.='2']", + "//lst[@name='suggest']/lst[@name='blended_infix_suggest_reciprocal']/lst[@name='the']/arr[@name='suggestions']/lst[3]/str[@name='payload'][.='star']" + ); + } + +}