diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 8791f7c403f..3af96bb8e8f 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -267,6 +267,10 @@ Bug Fixes * SOLR-2829: Fix problem with false-positives due to incorrect equals methods. (Yonik Seeley, Hossman, Erick Erickson. Marc Tinnemeyer caught the bug) + +* SOLR-2848: Removed 'instanceof AbstractLuceneSpellChecker' hacks from distributed spellchecking code, + and added a merge() method to SolrSpellChecker instead. Previously if you extended SolrSpellChecker + your spellchecker would not work in distributed fashion. (James Dyer via rmuir) Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java index 631e33a29a9..6c94dca15f7 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java @@ -22,6 +22,8 @@ import java.io.StringReader; import java.util.*; import java.util.concurrent.ConcurrentHashMap; +import org.apache.lucene.search.spell.DirectSpellChecker; +import org.apache.lucene.search.spell.JaroWinklerDistance; import org.apache.lucene.search.spell.LevensteinDistance; import org.apache.lucene.search.spell.StringDistance; import org.apache.lucene.search.spell.SuggestWord; @@ -147,7 +149,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar IndexReader reader = rb.req.getSearcher().getIndexReader(); boolean collate = params.getBool(SPELLCHECK_COLLATE, false); float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE); - SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest); + SolrParams customParams = getCustomParams(getDictionaryName(params), params); SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults, accuracy, 
customParams); SpellingResult spellingResult = spellChecker.getSuggestions(options); @@ -210,7 +212,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar * @param params The original SolrParams * @return The new Params */ - protected SolrParams getCustomParams(String dictionary, SolrParams params, boolean shardRequest) { + protected SolrParams getCustomParams(String dictionary, SolrParams params) { ModifiableSolrParams result = new ModifiableSolrParams(); Iterator iter = params.getParameterNamesIterator(); String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + "."; @@ -220,10 +222,6 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar result.add(nxt.substring(prefix.length()), params.getParams(nxt)); } } - if(shardRequest) - { - result.add(ShardParams.IS_SHARD, "true"); - } return result; } @@ -256,6 +254,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false); int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0); int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1); + int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1); + int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT); String origQuery = params.get(SPELLCHECK_Q); if (origQuery == null) { @@ -263,192 +263,30 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar if (origQuery == null) { origQuery = params.get(CommonParams.Q); } - } - - int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1); - float min = 0.5f; - StringDistance sd = null; - int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT); - SolrSpellChecker checker = getSpellChecker(rb.req.getParams()); - if (checker instanceof AbstractLuceneSpellChecker) { - AbstractLuceneSpellChecker spellChecker = (AbstractLuceneSpellChecker) 
checker; - min = spellChecker.getAccuracy(); - sd = spellChecker.getStringDistance(); - } - if (sd == null) - sd = new LevensteinDistance(); - - Collection tokens = null; - try { - tokens = getTokens(origQuery, checker.getQueryAnalyzer()); - } catch (IOException e) { - LOG.error("Could not get tokens (this should never happen)", e); - } - - // original token -> corresponding Suggestion object (keep track of start,end) - Map origVsSuggestion = new HashMap(); - // original token string -> summed up frequency - Map origVsFreq = new HashMap(); - // original token string -> # of shards reporting it as misspelled - Map origVsShards = new HashMap(); - // original token string -> set of alternatives - // must preserve order because collation algorithm can only work in-order - Map> origVsSuggested = new LinkedHashMap>(); - // alternative string -> corresponding SuggestWord object - Map suggestedVsWord = new HashMap(); - Map collations = new HashMap(); + } - int totalNumberShardResponses = 0; + SpellCheckMergeData mergeData = new SpellCheckMergeData(); for (ShardRequest sreq : rb.finished) { for (ShardResponse srsp : sreq.responses) { NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck"); LOG.info(srsp.getShard() + " " + nl); if (nl != null) { - totalNumberShardResponses++; - SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl); - for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) { - origVsSuggestion.put(suggestion.getToken(), suggestion); - HashSet suggested = origVsSuggested.get(suggestion.getToken()); - if (suggested == null) { - suggested = new HashSet(); - origVsSuggested.put(suggestion.getToken(), suggested); - } - - // sum up original frequency - int origFreq = 0; - Integer o = origVsFreq.get(suggestion.getToken()); - if (o != null) origFreq += o; - origFreq += suggestion.getOriginalFrequency(); - origVsFreq.put(suggestion.getToken(), origFreq); - - //# shards reporting - Integer origShards = 
origVsShards.get(suggestion.getToken()); - if(origShards==null) { - origVsShards.put(suggestion.getToken(), 1); - } else { - origVsShards.put(suggestion.getToken(), ++origShards); - } - - // find best suggestions - for (int i = 0; i < suggestion.getNumFound(); i++) { - String alternative = suggestion.getAlternatives().get(i); - suggested.add(alternative); - SuggestWord sug = suggestedVsWord.get(alternative); - if (sug == null) { - sug = new SuggestWord(); - suggestedVsWord.put(alternative, sug); - } - sug.string = alternative; - // alternative frequency is present only for extendedResults=true - if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) { - Integer freq = suggestion.getAlternativeFrequencies().get(i); - if (freq != null) sug.freq += freq; - } - } - } - NamedList suggestions = (NamedList) nl.get("suggestions"); - if(suggestions != null) { - List collationList = suggestions.getAll("collation"); - List collationRankList = suggestions.getAll("collationInternalRank"); - int i=0; - if(collationList != null) { - for(Object o : collationList) - { - if(o instanceof String) - { - SpellCheckCollation coll = new SpellCheckCollation(); - coll.setCollationQuery((String) o); - if(collationRankList!= null && collationRankList.size()>0) - { - coll.setInternalRank((Integer) collationRankList.get(i)); - i++; - } - SpellCheckCollation priorColl = collations.get(coll.getCollationQuery()); - if(priorColl != null) - { - coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank())); - } - collations.put(coll.getCollationQuery(), coll); - } else - { - NamedList expandedCollation = (NamedList) o; - SpellCheckCollation coll = new SpellCheckCollation(); - coll.setCollationQuery((String) expandedCollation.get("collationQuery")); - coll.setHits((Integer) expandedCollation.get("hits")); - if(maxCollationTries>0) - { - coll.setInternalRank((Integer) expandedCollation.get("collationInternalRank")); - } - 
coll.setMisspellingsAndCorrections((NamedList) expandedCollation.get("misspellingsAndCorrections")); - SpellCheckCollation priorColl = collations.get(coll.getCollationQuery()); - if(priorColl != null) - { - coll.setHits(coll.getHits() + priorColl.getHits()); - coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank())); - } - collations.put(coll.getCollationQuery(), coll); - } - } - } - } + mergeData.totalNumberShardResponses++; + collectShardSuggestions(nl, mergeData); + collectShardCollations(mergeData, nl, maxCollationTries); } } } // all shard responses have been collected // create token and get top suggestions - SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand? - for (Map.Entry> entry : origVsSuggested.entrySet()) { - String original = entry.getKey(); - - //Only use this suggestion if all shards reported it as misspelled. - Integer numShards = origVsShards.get(original); - if(numShards suggested = entry.getValue(); - SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); - for (String suggestion : suggested) { - SuggestWord sug = suggestedVsWord.get(suggestion); - sug.score = sd.getDistance(original, sug.string); - if (sug.score < min) continue; - sugQueue.insertWithOverflow(sug); - if (sugQueue.size() == numSug) { - // if queue full, maintain the minScore score - min = sugQueue.top().score; - } - } - - // create token - SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original); - Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset()); - - // get top 'count' suggestions out of 'sugQueue.size()' candidates - SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())]; - // skip the first sugQueue.size() - count elements - for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop(); - // now collect the top 'count' responses - for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) { - 
suggestions[k] = sugQueue.pop(); - } - - if (extendedResults) { - Integer o = origVsFreq.get(original); - if (o != null) result.addFrequency(token, o); - for (SuggestWord word : suggestions) - result.add(token, word.string, word.freq); - } else { - List words = new ArrayList(sugQueue.size()); - for (SuggestWord word : suggestions) words.add(word.string); - result.add(token, words); - } - } + SolrSpellChecker checker = getSpellChecker(rb.req.getParams()); + SpellingResult result = checker.mergeSuggestions(mergeData, numSug, count, extendedResults); NamedList response = new SimpleOrderedMap(); NamedList suggestions = toNamedList(false, result, origQuery, extendedResults, collate); if (collate) { - SpellCheckCollation[] sortedCollations = collations.values().toArray(new SpellCheckCollation[collations.size()]); + SpellCheckCollation[] sortedCollations = mergeData.collations.values().toArray(new SpellCheckCollation[mergeData.collations.size()]); Arrays.sort(sortedCollations); int i = 0; while (i < maxCollations && i < sortedCollations.length) { @@ -470,6 +308,101 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar response.add("suggestions", suggestions); rb.rsp.add("spellcheck", response); } + + @SuppressWarnings("unchecked") + private void collectShardSuggestions(NamedList nl, SpellCheckMergeData mergeData) { + SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl); + for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) { + mergeData.origVsSuggestion.put(suggestion.getToken(), suggestion); + HashSet suggested = mergeData.origVsSuggested.get(suggestion.getToken()); + if (suggested == null) { + suggested = new HashSet(); + mergeData.origVsSuggested.put(suggestion.getToken(), suggested); + } + + // sum up original frequency + int origFreq = 0; + Integer o = mergeData.origVsFreq.get(suggestion.getToken()); + if (o != null) origFreq += o; + origFreq += suggestion.getOriginalFrequency(); + 
mergeData.origVsFreq.put(suggestion.getToken(), origFreq); + + //# shards reporting + Integer origShards = mergeData.origVsShards.get(suggestion.getToken()); + if(origShards==null) { + mergeData.origVsShards.put(suggestion.getToken(), 1); + } else { + mergeData.origVsShards.put(suggestion.getToken(), ++origShards); + } + + // find best suggestions + for (int i = 0; i < suggestion.getNumFound(); i++) { + String alternative = suggestion.getAlternatives().get(i); + suggested.add(alternative); + SuggestWord sug = mergeData.suggestedVsWord.get(alternative); + if (sug == null) { + sug = new SuggestWord(); + mergeData.suggestedVsWord.put(alternative, sug); + } + sug.string = alternative; + // alternative frequency is present only for extendedResults=true + if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) { + Integer freq = suggestion.getAlternativeFrequencies().get(i); + if (freq != null) sug.freq += freq; + } + } + } + } + + @SuppressWarnings("unchecked") + private void collectShardCollations(SpellCheckMergeData mergeData, NamedList spellCheckResponse, int maxCollationTries) { + Map collations = mergeData.collations; + NamedList suggestions = (NamedList) spellCheckResponse.get("suggestions"); + if(suggestions != null) { + List collationList = suggestions.getAll("collation"); + List collationRankList = suggestions.getAll("collationInternalRank"); + int i=0; + if(collationList != null) { + for(Object o : collationList) + { + if(o instanceof String) + { + SpellCheckCollation coll = new SpellCheckCollation(); + coll.setCollationQuery((String) o); + if(collationRankList!= null && collationRankList.size()>0) + { + coll.setInternalRank((Integer) collationRankList.get(i)); + i++; + } + SpellCheckCollation priorColl = collations.get(coll.getCollationQuery()); + if(priorColl != null) + { + coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank())); + } + collations.put(coll.getCollationQuery(), coll); 
+ } else + { + NamedList expandedCollation = (NamedList) o; + SpellCheckCollation coll = new SpellCheckCollation(); + coll.setCollationQuery((String) expandedCollation.get("collationQuery")); + coll.setHits((Integer) expandedCollation.get("hits")); + if(maxCollationTries>0) + { + coll.setInternalRank((Integer) expandedCollation.get("collationInternalRank")); + } + coll.setMisspellingsAndCorrections((NamedList) expandedCollation.get("misspellingsAndCorrections")); + SpellCheckCollation priorColl = collations.get(coll.getCollationQuery()); + if(priorColl != null) + { + coll.setHits(coll.getHits() + priorColl.getHits()); + coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank())); + } + collations.put(coll.getCollationQuery(), coll); + } + } + } + } + } private Collection getTokens(String q, Analyzer analyzer) throws IOException { Collection result = new ArrayList(); diff --git a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckMergeData.java b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckMergeData.java new file mode 100644 index 00000000000..e2de7621d12 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckMergeData.java @@ -0,0 +1,43 @@ +package org.apache.solr.handler.component; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; + +import org.apache.lucene.search.spell.SuggestWord; +import org.apache.solr.client.solrj.response.SpellCheckResponse; +import org.apache.solr.spelling.SpellCheckCollation; + +public class SpellCheckMergeData { + //original token -> corresponding Suggestion object (keep track of start,end) + public Map origVsSuggestion = new HashMap(); + // original token string -> summed up frequency + public Map origVsFreq = new HashMap(); + // original token string -> # of shards reporting it as misspelled + public Map origVsShards = new HashMap(); + // original token string -> set of alternatives + // must preserve order because collation algorithm can only work in-order + public Map> origVsSuggested = new LinkedHashMap>(); + // alternative string -> corresponding SuggestWord object + public Map suggestedVsWord = new HashMap(); + public Map collations = new HashMap(); + public int totalNumberShardResponses = 0; +} diff --git a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java index efd42b6ab25..91e7993aab0 100644 --- a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java +++ b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java @@ -141,13 +141,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { @Override public SpellingResult getSuggestions(SpellingOptions options) throws IOException { - boolean shardRequest = false; - SolrParams params = options.customParams; - if(params!=null) - { - shardRequest = "true".equals(params.get(ShardParams.IS_SHARD)); - } - SpellingResult result = new SpellingResult(options.tokens); + SpellingResult result = new SpellingResult(options.tokens); 
IndexReader reader = determineReader(options.reader); Term term = field != null ? new Term(field, "") : null; float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy; @@ -176,7 +170,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { term = new Term(field, suggestions[i]); result.add(token, suggestions[i], reader.docFreq(term)); } - } else if(shardRequest) { + } else { List suggList = Collections.emptyList(); result.add(token, suggList); } @@ -187,7 +181,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { suggList = suggList.subList(0, options.count); } result.add(token, suggList); - } else if(shardRequest) { + } else { List suggList = Collections.emptyList(); result.add(token, suggList); } @@ -222,6 +216,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { /* * @return the Accuracy used for the Spellchecker * */ + @Override public float getAccuracy() { return accuracy; } @@ -257,6 +252,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { return sourceLocation; } + @Override public StringDistance getStringDistance() { return sd; } diff --git a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java index b047522647d..f68b99a5fa1 100644 --- a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java +++ b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java @@ -18,7 +18,9 @@ package org.apache.solr.spelling; */ import java.io.IOException; +import java.util.Collections; import java.util.Comparator; +import java.util.List; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; @@ -29,6 +31,8 @@ import org.apache.lucene.search.spell.SuggestMode; import org.apache.lucene.search.spell.SuggestWord; import 
org.apache.lucene.search.spell.SuggestWordFrequencyComparator; import org.apache.lucene.search.spell.SuggestWordQueue; +import org.apache.solr.common.params.ShardParams; +import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SpellingParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; @@ -174,19 +178,41 @@ public class DirectSolrSpellChecker extends SolrSpellChecker { public SpellingResult getSuggestions(SpellingOptions options) throws IOException { LOG.debug("getSuggestions: " + options.tokens); - + SpellingResult result = new SpellingResult(); float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy; SuggestMode mode = options.onlyMorePopular ? SuggestMode.SUGGEST_MORE_POPULAR : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX; for (Token token : options.tokens) { Term term = new Term(field, token.toString()); SuggestWord[] suggestions = checker.suggestSimilar(term, - options.count, options.reader, mode, accuracy); - result.addFrequency(token, options.reader.docFreq(term)); - for (SuggestWord suggestion : suggestions) { - result.add(token, suggestion.string, suggestion.freq); + options.count, options.reader, mode, accuracy); + + int docFreq = 0; + if(options.extendedResults || suggestions.length==0) { + docFreq = options.reader.docFreq(term); + } + + if(options.extendedResults) { + result.addFrequency(token, docFreq); + } + if(suggestions.length==0 && docFreq==0) { + List empty = Collections.emptyList(); + result.add(token, empty); + } else { + for (SuggestWord suggestion : suggestions) { + result.add(token, suggestion.string, suggestion.freq); + } } } return result; } + + @Override + public float getAccuracy() { + return checker.getAccuracy(); + } + @Override + public StringDistance getStringDistance() { + return checker.getDistance(); + } } diff --git a/solr/core/src/java/org/apache/solr/spelling/PossibilityIterator.java 
b/solr/core/src/java/org/apache/solr/spelling/PossibilityIterator.java index 84e41e27ede..a3deda4c784 100644 --- a/solr/core/src/java/org/apache/solr/spelling/PossibilityIterator.java +++ b/solr/core/src/java/org/apache/solr/spelling/PossibilityIterator.java @@ -59,6 +59,9 @@ public class PossibilityIterator implements Iterator { public PossibilityIterator(Map> suggestions, int maximumRequiredSuggestions, int maxEvaluations) { for (Map.Entry> entry : suggestions.entrySet()) { Token token = entry.getKey(); + if(entry.getValue().size()==0) { + continue; + } List possibleCorrections = new ArrayList(); for (Map.Entry entry1 : entry.getValue().entrySet()) { SpellCheckCorrection correction = new SpellCheckCorrection(); diff --git a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java index a115303d0e5..009567e9efe 100644 --- a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java +++ b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java @@ -17,13 +17,24 @@ package org.apache.solr.spelling; */ import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.search.spell.LevensteinDistance; +import org.apache.lucene.search.spell.StringDistance; +import org.apache.lucene.search.spell.SuggestWord; +import org.apache.lucene.search.spell.SuggestWordQueue; +import org.apache.solr.client.solrj.response.SpellCheckResponse; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.component.SpellCheckMergeData; import org.apache.solr.schema.FieldType; import org.apache.solr.search.SolrIndexSearcher; import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; /** @@ -64,6 +75,74 @@ public abstract class SolrSpellChecker { } return name; } + /** + 
* Integrate spelling suggestions from the various shards in a distributed environment. + * + * @param mergeData + * @param numSug + * @param count + * @param extendedResults + * @return + */ + public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) { + float min = 0.5f; + try { + min = getAccuracy(); + } catch(UnsupportedOperationException uoe) { + //just use .5 as a default + } + + StringDistance sd = getStringDistance() == null ? new LevensteinDistance() : getStringDistance(); + + SpellingResult result = new SpellingResult(); + for (Map.Entry> entry : mergeData.origVsSuggested.entrySet()) { + String original = entry.getKey(); + + //Only use this suggestion if all shards reported it as misspelled. + Integer numShards = mergeData.origVsShards.get(original); + if(numShards suggested = entry.getValue(); + SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); + for (String suggestion : suggested) { + SuggestWord sug = mergeData.suggestedVsWord.get(suggestion); + sug.score = sd.getDistance(original, sug.string); + if (sug.score < min) continue; + sugQueue.insertWithOverflow(sug); + if (sugQueue.size() == numSug) { + // if queue full, maintain the minScore score + min = sugQueue.top().score; + } + } + + // create token + SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original); + Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset()); + + // get top 'count' suggestions out of 'sugQueue.size()' candidates + SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())]; + // skip the first sugQueue.size() - count elements + for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop(); + // now collect the top 'count' responses + for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) { + suggestions[k] = sugQueue.pop(); + } + + if (extendedResults) { + Integer o = mergeData.origVsFreq.get(original); + if (o != null) 
result.addFrequency(token, o); + for (SuggestWord word : suggestions) + result.add(token, word.string, word.freq); + } else { + List<String> words = new ArrayList<String>(suggestions.length); + for (SuggestWord word : suggestions) words.add(word.string); + result.add(token, words); + } + } + return result; + } public Analyzer getQueryAnalyzer() { return analyzer; @@ -84,6 +163,23 @@ public abstract class SolrSpellChecker { * (re)Builds the spelling index. May be a NOOP if the implementation doesn't require building, or can't be rebuilt. */ public abstract void build(SolrCore core, SolrIndexSearcher searcher); + + /** + * Get the value of {@link org.apache.solr.common.params.SpellingParams#SPELLCHECK_ACCURACY} if supported. + * Otherwise throws UnsupportedOperationException. + * @return the minimum accuracy a suggestion must reach + */ + protected float getAccuracy() { + throw new UnsupportedOperationException(); + } + + /** + * Get the distance implementation used by this spellchecker, or NULL if not applicable. + * @return the distance implementation, or null; mergeSuggestions() then falls back to LevensteinDistance + */ + protected StringDistance getStringDistance() { + return null; + } /** diff --git a/solr/core/src/test-files/solr/conf/solrconfig.xml b/solr/core/src/test-files/solr/conf/solrconfig.xml index 851e2c2cd91..8c916d599ec 100644 --- a/solr/core/src/test-files/solr/conf/solrconfig.xml +++ b/solr/core/src/test-files/solr/conf/solrconfig.xml @@ -323,6 +323,12 @@ spellchecker1 false + + direct + DirectSolrSpellChecker + lowerfilt + 3 + multipleFields lowerfilt1and2 @@ -397,6 +403,17 @@ spellcheck + + + + direct + false + false + 1 + + + spellcheck + diff --git a/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java b/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java index 73efb4b908e..b14b59a1bbe 100644 --- a/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java +++ b/solr/core/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java @@ -30,10 +30,13 @@ import 
org.apache.solr.common.params.ModifiableSolrParams; */ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase { + private String requestHandlerName; + public DistributedSpellCheckComponentTest() { //fixShardCount=true; //shardCount=2; + //stress=0; } private String saveProp; @@ -41,7 +44,8 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes public void setUp() throws Exception { // this test requires FSDir saveProp = System.getProperty("solr.directoryFactory"); - System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); + System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); + requestHandlerName = random.nextBoolean() ? "spellCheckCompRH" : "spellCheckCompRH_Direct"; super.setUp(); } @@ -104,15 +108,17 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes handle.put("maxScore", SKIPVAL); // we care only about the spellcheck results handle.put("response", SKIP); + q("q", "*:*", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH"); - query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH"); - query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true"); - query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4"); - query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, 
"4", SpellCheckComponent.SPELLCHECK_COLLATE, "true"); + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName); + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true"); + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4"); + query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true"); - query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true"); - query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false"); - query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", 
"qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "0", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false"); + query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true"); + query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false"); + query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "0", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false"); + } } diff --git a/solr/core/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java index 3c536058f1f..633d47a12fd 100644 
--- a/solr/core/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java +++ b/solr/core/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java @@ -175,7 +175,7 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 { result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); suggestions = result.get(spellOpts.tokens.iterator().next()); - assertTrue("suggestions is not null and it should be", suggestions == null); + assertTrue("suggestions size should be 0", suggestions.size()==0); searcher.decref(); } } diff --git a/solr/core/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java index a17aad1d723..0983ae8fe0b 100644 --- a/solr/core/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java +++ b/solr/core/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java @@ -140,7 +140,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); suggestions = result.get(spellOpts.tokens.iterator().next()); - assertTrue("suggestions is not null and it should be", suggestions == null); + assertTrue("suggestions size should be 0", suggestions.size()==0); //test something that is spelled correctly spellOpts.tokens = queryConverter.convert("document"); @@ -215,7 +215,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); suggestions = result.get(spellOpts.tokens.iterator().next()); - assertTrue("suggestions is not null and it should be", suggestions == null); + assertTrue("suggestions size should be 0", suggestions.size()==0); spellOpts.tokens = queryConverter.convert("document"); result = checker.getSuggestions(spellOpts); @@ -328,7 +328,7 @@ public class 
IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); suggestions = result.get(spellOpts.tokens.iterator().next()); - assertTrue("suggestions is not null and it should be", suggestions == null); + assertTrue("suggestions size should be 0", suggestions.size()==0); spellOpts.tokens = queryConverter.convert("Caroline"); result = checker.getSuggestions(spellOpts);