SOLR-2848: generalize distributed spellcheck code to work with any SolrSpellChecker

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200266 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-11-10 10:54:46 +00:00
parent ee293e7e7d
commit b97d321f82
11 changed files with 326 additions and 202 deletions

View File

@ -268,6 +268,10 @@ Bug Fixes
equals methods. (Yonik Seeley, Hossman, Erick Erickson. equals methods. (Yonik Seeley, Hossman, Erick Erickson.
Marc Tinnemeyer caught the bug) Marc Tinnemeyer caught the bug)
* SOLR-2848: Removed 'instanceof AbstractLuceneSpellChecker' hacks from distributed spellchecking code,
and added a mergeSuggestions() method to SolrSpellChecker instead. Previously, if you extended
SolrSpellChecker, your spellchecker would not work in distributed fashion. (James Dyer via rmuir)
Other Changes Other Changes
---------------------- ----------------------

View File

@ -22,6 +22,8 @@ import java.io.StringReader;
import java.util.*; import java.util.*;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.LevensteinDistance; import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.StringDistance; import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord; import org.apache.lucene.search.spell.SuggestWord;
@ -147,7 +149,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
IndexReader reader = rb.req.getSearcher().getIndexReader(); IndexReader reader = rb.req.getSearcher().getIndexReader();
boolean collate = params.getBool(SPELLCHECK_COLLATE, false); boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE); float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest); SolrParams customParams = getCustomParams(getDictionaryName(params), params);
SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults, SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults,
accuracy, customParams); accuracy, customParams);
SpellingResult spellingResult = spellChecker.getSuggestions(options); SpellingResult spellingResult = spellChecker.getSuggestions(options);
@ -210,7 +212,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
* @param params The original SolrParams * @param params The original SolrParams
* @return The new Params * @return The new Params
*/ */
protected SolrParams getCustomParams(String dictionary, SolrParams params, boolean shardRequest) { protected SolrParams getCustomParams(String dictionary, SolrParams params) {
ModifiableSolrParams result = new ModifiableSolrParams(); ModifiableSolrParams result = new ModifiableSolrParams();
Iterator<String> iter = params.getParameterNamesIterator(); Iterator<String> iter = params.getParameterNamesIterator();
String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + "."; String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + ".";
@ -220,10 +222,6 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
result.add(nxt.substring(prefix.length()), params.getParams(nxt)); result.add(nxt.substring(prefix.length()), params.getParams(nxt));
} }
} }
if(shardRequest)
{
result.add(ShardParams.IS_SHARD, "true");
}
return result; return result;
} }
@ -256,6 +254,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false); boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false);
int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0); int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0);
int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1); int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1);
int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1);
int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
String origQuery = params.get(SPELLCHECK_Q); String origQuery = params.get(SPELLCHECK_Q);
if (origQuery == null) { if (origQuery == null) {
@ -265,190 +265,28 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
} }
} }
int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1); SpellCheckMergeData mergeData = new SpellCheckMergeData();
float min = 0.5f;
StringDistance sd = null;
int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
SolrSpellChecker checker = getSpellChecker(rb.req.getParams());
if (checker instanceof AbstractLuceneSpellChecker) {
AbstractLuceneSpellChecker spellChecker = (AbstractLuceneSpellChecker) checker;
min = spellChecker.getAccuracy();
sd = spellChecker.getStringDistance();
}
if (sd == null)
sd = new LevensteinDistance();
Collection<Token> tokens = null;
try {
tokens = getTokens(origQuery, checker.getQueryAnalyzer());
} catch (IOException e) {
LOG.error("Could not get tokens (this should never happen)", e);
}
// original token -> corresponding Suggestion object (keep track of start,end)
Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
// original token string -> summed up frequency
Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
// original token string -> # of shards reporting it as misspelled
Map<String, Integer> origVsShards = new HashMap<String, Integer>();
// original token string -> set of alternatives
// must preserve order because collation algorithm can only work in-order
Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
// alternative string -> corresponding SuggestWord object
Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
Map<String, SpellCheckCollation> collations = new HashMap<String, SpellCheckCollation>();
int totalNumberShardResponses = 0;
for (ShardRequest sreq : rb.finished) { for (ShardRequest sreq : rb.finished) {
for (ShardResponse srsp : sreq.responses) { for (ShardResponse srsp : sreq.responses) {
NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck"); NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
LOG.info(srsp.getShard() + " " + nl); LOG.info(srsp.getShard() + " " + nl);
if (nl != null) { if (nl != null) {
totalNumberShardResponses++; mergeData.totalNumberShardResponses++;
SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl); collectShardSuggestions(nl, mergeData);
for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) { collectShardCollations(mergeData, nl, maxCollationTries);
origVsSuggestion.put(suggestion.getToken(), suggestion);
HashSet<String> suggested = origVsSuggested.get(suggestion.getToken());
if (suggested == null) {
suggested = new HashSet<String>();
origVsSuggested.put(suggestion.getToken(), suggested);
}
// sum up original frequency
int origFreq = 0;
Integer o = origVsFreq.get(suggestion.getToken());
if (o != null) origFreq += o;
origFreq += suggestion.getOriginalFrequency();
origVsFreq.put(suggestion.getToken(), origFreq);
//# shards reporting
Integer origShards = origVsShards.get(suggestion.getToken());
if(origShards==null) {
origVsShards.put(suggestion.getToken(), 1);
} else {
origVsShards.put(suggestion.getToken(), ++origShards);
}
// find best suggestions
for (int i = 0; i < suggestion.getNumFound(); i++) {
String alternative = suggestion.getAlternatives().get(i);
suggested.add(alternative);
SuggestWord sug = suggestedVsWord.get(alternative);
if (sug == null) {
sug = new SuggestWord();
suggestedVsWord.put(alternative, sug);
}
sug.string = alternative;
// alternative frequency is present only for extendedResults=true
if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) {
Integer freq = suggestion.getAlternativeFrequencies().get(i);
if (freq != null) sug.freq += freq;
}
}
}
NamedList suggestions = (NamedList) nl.get("suggestions");
if(suggestions != null) {
List<Object> collationList = suggestions.getAll("collation");
List<Object> collationRankList = suggestions.getAll("collationInternalRank");
int i=0;
if(collationList != null) {
for(Object o : collationList)
{
if(o instanceof String)
{
SpellCheckCollation coll = new SpellCheckCollation();
coll.setCollationQuery((String) o);
if(collationRankList!= null && collationRankList.size()>0)
{
coll.setInternalRank((Integer) collationRankList.get(i));
i++;
}
SpellCheckCollation priorColl = collations.get(coll.getCollationQuery());
if(priorColl != null)
{
coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank()));
}
collations.put(coll.getCollationQuery(), coll);
} else
{
NamedList expandedCollation = (NamedList) o;
SpellCheckCollation coll = new SpellCheckCollation();
coll.setCollationQuery((String) expandedCollation.get("collationQuery"));
coll.setHits((Integer) expandedCollation.get("hits"));
if(maxCollationTries>0)
{
coll.setInternalRank((Integer) expandedCollation.get("collationInternalRank"));
}
coll.setMisspellingsAndCorrections((NamedList) expandedCollation.get("misspellingsAndCorrections"));
SpellCheckCollation priorColl = collations.get(coll.getCollationQuery());
if(priorColl != null)
{
coll.setHits(coll.getHits() + priorColl.getHits());
coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank()));
}
collations.put(coll.getCollationQuery(), coll);
}
}
}
}
} }
} }
} }
// all shard responses have been collected // all shard responses have been collected
// create token and get top suggestions // create token and get top suggestions
SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand? SolrSpellChecker checker = getSpellChecker(rb.req.getParams());
for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet()) { SpellingResult result = checker.mergeSuggestions(mergeData, numSug, count, extendedResults);
String original = entry.getKey();
//Only use this suggestion if all shards reported it as misspelled.
Integer numShards = origVsShards.get(original);
if(numShards<totalNumberShardResponses) {
continue;
}
HashSet<String> suggested = entry.getValue();
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
for (String suggestion : suggested) {
SuggestWord sug = suggestedVsWord.get(suggestion);
sug.score = sd.getDistance(original, sug.string);
if (sug.score < min) continue;
sugQueue.insertWithOverflow(sug);
if (sugQueue.size() == numSug) {
// if queue full, maintain the minScore score
min = sugQueue.top().score;
}
}
// create token
SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());
// get top 'count' suggestions out of 'sugQueue.size()' candidates
SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())];
// skip the first sugQueue.size() - count elements
for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop();
// now collect the top 'count' responses
for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) {
suggestions[k] = sugQueue.pop();
}
if (extendedResults) {
Integer o = origVsFreq.get(original);
if (o != null) result.addFrequency(token, o);
for (SuggestWord word : suggestions)
result.add(token, word.string, word.freq);
} else {
List<String> words = new ArrayList<String>(sugQueue.size());
for (SuggestWord word : suggestions) words.add(word.string);
result.add(token, words);
}
}
NamedList response = new SimpleOrderedMap(); NamedList response = new SimpleOrderedMap();
NamedList suggestions = toNamedList(false, result, origQuery, extendedResults, collate); NamedList suggestions = toNamedList(false, result, origQuery, extendedResults, collate);
if (collate) { if (collate) {
SpellCheckCollation[] sortedCollations = collations.values().toArray(new SpellCheckCollation[collations.size()]); SpellCheckCollation[] sortedCollations = mergeData.collations.values().toArray(new SpellCheckCollation[mergeData.collations.size()]);
Arrays.sort(sortedCollations); Arrays.sort(sortedCollations);
int i = 0; int i = 0;
while (i < maxCollations && i < sortedCollations.length) { while (i < maxCollations && i < sortedCollations.length) {
@ -471,6 +309,101 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
rb.rsp.add("spellcheck", response); rb.rsp.add("spellcheck", response);
} }
/**
 * Folds the per-token suggestions of a single shard's "spellcheck" section into
 * <code>mergeData</code>.
 * <p>
 * For every suggestion reported by the shard this records the Suggestion object itself,
 * adds the shard's original-term frequency to the running total, bumps the count of shards
 * that reported the token, and registers each alternative (accumulating its frequency when
 * the shard supplied one).
 *
 * @param nl the shard's "spellcheck" response section
 * @param mergeData accumulator shared across all shard responses; updated in place
 */
@SuppressWarnings("unchecked")
private void collectShardSuggestions(NamedList nl, SpellCheckMergeData mergeData) {
  SpellCheckResponse shardResponse = new SpellCheckResponse(nl);
  for (SpellCheckResponse.Suggestion shardSuggestion : shardResponse.getSuggestions()) {
    final String token = shardSuggestion.getToken();

    // the last shard to report a token wins; the Suggestion is kept for its offsets
    mergeData.origVsSuggestion.put(token, shardSuggestion);

    HashSet<String> alternativesForToken = mergeData.origVsSuggested.get(token);
    if (alternativesForToken == null) {
      alternativesForToken = new HashSet<String>();
      mergeData.origVsSuggested.put(token, alternativesForToken);
    }

    // original term frequency, summed over the shards seen so far
    Integer freqSoFar = mergeData.origVsFreq.get(token);
    int summedFreq = (freqSoFar == null ? 0 : freqSoFar) + shardSuggestion.getOriginalFrequency();
    mergeData.origVsFreq.put(token, summedFreq);

    // number of shards that reported this token
    Integer shardsSoFar = mergeData.origVsShards.get(token);
    mergeData.origVsShards.put(token, shardsSoFar == null ? 1 : shardsSoFar + 1);

    // register every alternative offered by this shard
    List<String> alternatives = shardSuggestion.getAlternatives();
    // alternative frequencies are present only for extendedResults=true
    List<Integer> alternativeFreqs = shardSuggestion.getAlternativeFrequencies();
    boolean hasFreqs = alternativeFreqs != null && alternativeFreqs.size() > 0;
    int numFound = shardSuggestion.getNumFound();
    for (int idx = 0; idx < numFound; idx++) {
      String alternative = alternatives.get(idx);
      alternativesForToken.add(alternative);

      SuggestWord word = mergeData.suggestedVsWord.get(alternative);
      if (word == null) {
        word = new SuggestWord();
        mergeData.suggestedVsWord.put(alternative, word);
      }
      word.string = alternative;

      if (hasFreqs) {
        Integer freq = alternativeFreqs.get(idx);
        if (freq != null) {
          word.freq += freq;
        }
      }
    }
  }
}
/**
 * Folds the collations of a single shard's "spellcheck" section into
 * <code>mergeData.collations</code>, keyed by collation query.
 * <p>
 * A collation is either a plain String (its rank, if any, comes from the parallel
 * "collationInternalRank" entries) or an expanded NamedList carrying "collationQuery",
 * "hits", "collationInternalRank" and "misspellingsAndCorrections". When the same collation
 * query was already recorded, the higher internal rank is kept and, for expanded collations,
 * the hit counts are added together.
 *
 * @param mergeData accumulator shared across all shard responses; updated in place
 * @param spellCheckResponse the shard's "spellcheck" response section
 * @param maxCollationTries the internal rank of an expanded collation is read only when this is &gt; 0
 */
@SuppressWarnings("unchecked")
private void collectShardCollations(SpellCheckMergeData mergeData, NamedList spellCheckResponse, int maxCollationTries) {
  Map<String, SpellCheckCollation> merged = mergeData.collations;
  NamedList suggestions = (NamedList) spellCheckResponse.get("suggestions");
  if (suggestions == null) {
    return;
  }

  List<Object> collationList = suggestions.getAll("collation");
  List<Object> collationRankList = suggestions.getAll("collationInternalRank");
  if (collationList == null) {
    return;
  }

  // position in collationRankList; advances only for plain (String) collations
  int rankIdx = 0;
  for (Object rawCollation : collationList) {
    SpellCheckCollation coll = new SpellCheckCollation();
    boolean expanded = !(rawCollation instanceof String);

    if (expanded) {
      NamedList expandedCollation = (NamedList) rawCollation;
      coll.setCollationQuery((String) expandedCollation.get("collationQuery"));
      coll.setHits((Integer) expandedCollation.get("hits"));
      if (maxCollationTries > 0) {
        coll.setInternalRank((Integer) expandedCollation.get("collationInternalRank"));
      }
      coll.setMisspellingsAndCorrections((NamedList) expandedCollation.get("misspellingsAndCorrections"));
    } else {
      coll.setCollationQuery((String) rawCollation);
      if (collationRankList != null && collationRankList.size() > 0) {
        coll.setInternalRank((Integer) collationRankList.get(rankIdx));
        rankIdx++;
      }
    }

    String key = coll.getCollationQuery();
    SpellCheckCollation previous = merged.get(key);
    if (previous != null) {
      if (expanded) {
        coll.setHits(coll.getHits() + previous.getHits());
      }
      coll.setInternalRank(Math.max(coll.getInternalRank(), previous.getInternalRank()));
    }
    merged.put(key, coll);
  }
}
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException { private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
Collection<Token> result = new ArrayList<Token>(); Collection<Token> result = new ArrayList<Token>();
assert analyzer != null; assert analyzer != null;

View File

@ -0,0 +1,43 @@
package org.apache.solr.handler.component;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.solr.client.solrj.response.SpellCheckResponse;
import org.apache.solr.spelling.SpellCheckCollation;
/**
 * Mutable accumulator for the spellcheck data gathered from shard responses during a
 * distributed request. SpellCheckComponent fills it one shard response at a time and then
 * passes it to {@link org.apache.solr.spelling.SolrSpellChecker#mergeSuggestions}.
 * All state is exposed as public fields; there are no accessors.
 */
public class SpellCheckMergeData {
// original token -> corresponding Suggestion object (keeps track of start and end offsets)
public Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
// original token string -> frequency summed over the shards that reported it
public Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
// original token string -> # of shards reporting it as misspelled
public Map<String, Integer> origVsShards = new HashMap<String, Integer>();
// original token string -> set of alternatives
// must preserve order because collation algorithm can only work in-order
public Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
// alternative string -> corresponding SuggestWord object
public Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
// collation query string -> collation merged from the shards that returned it
public Map<String, SpellCheckCollation> collations = new HashMap<String, SpellCheckCollation>();
// number of shard responses that contained a "spellcheck" section
public int totalNumberShardResponses = 0;
}

View File

@ -141,13 +141,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
@Override @Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException { public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
boolean shardRequest = false; SpellingResult result = new SpellingResult(options.tokens);
SolrParams params = options.customParams;
if(params!=null)
{
shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
}
SpellingResult result = new SpellingResult(options.tokens);
IndexReader reader = determineReader(options.reader); IndexReader reader = determineReader(options.reader);
Term term = field != null ? new Term(field, "") : null; Term term = field != null ? new Term(field, "") : null;
float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy; float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;
@ -176,7 +170,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
term = new Term(field, suggestions[i]); term = new Term(field, suggestions[i]);
result.add(token, suggestions[i], reader.docFreq(term)); result.add(token, suggestions[i], reader.docFreq(term));
} }
} else if(shardRequest) { } else {
List<String> suggList = Collections.emptyList(); List<String> suggList = Collections.emptyList();
result.add(token, suggList); result.add(token, suggList);
} }
@ -187,7 +181,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
suggList = suggList.subList(0, options.count); suggList = suggList.subList(0, options.count);
} }
result.add(token, suggList); result.add(token, suggList);
} else if(shardRequest) { } else {
List<String> suggList = Collections.emptyList(); List<String> suggList = Collections.emptyList();
result.add(token, suggList); result.add(token, suggList);
} }
@ -222,6 +216,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
/* /*
* @return the Accuracy used for the Spellchecker * @return the Accuracy used for the Spellchecker
* */ * */
@Override
public float getAccuracy() { public float getAccuracy() {
return accuracy; return accuracy;
} }
@ -257,6 +252,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
return sourceLocation; return sourceLocation;
} }
@Override
public StringDistance getStringDistance() { public StringDistance getStringDistance() {
return sd; return sd;
} }

View File

@ -18,7 +18,9 @@ package org.apache.solr.spelling;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
@ -29,6 +31,8 @@ import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord; import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator; import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue; import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.SpellingParams; import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrCore;
@ -182,11 +186,33 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
Term term = new Term(field, token.toString()); Term term = new Term(field, token.toString());
SuggestWord[] suggestions = checker.suggestSimilar(term, SuggestWord[] suggestions = checker.suggestSimilar(term,
options.count, options.reader, mode, accuracy); options.count, options.reader, mode, accuracy);
result.addFrequency(token, options.reader.docFreq(term));
for (SuggestWord suggestion : suggestions) { int docFreq = 0;
result.add(token, suggestion.string, suggestion.freq); if(options.extendedResults || suggestions.length==0) {
docFreq = options.reader.docFreq(term);
}
if(options.extendedResults) {
result.addFrequency(token, docFreq);
}
if(suggestions.length==0 && docFreq==0) {
List<String> empty = Collections.emptyList();
result.add(token, empty);
} else {
for (SuggestWord suggestion : suggestions) {
result.add(token, suggestion.string, suggestion.freq);
}
} }
} }
return result; return result;
} }
/**
 * Returns the accuracy reported by the wrapped <code>checker</code>.
 */
@Override
public float getAccuracy() {
return checker.getAccuracy();
}
/**
 * Returns the string distance implementation reported by the wrapped <code>checker</code>.
 */
@Override
public StringDistance getStringDistance() {
return checker.getDistance();
}
} }

View File

@ -59,6 +59,9 @@ public class PossibilityIterator implements Iterator<RankedSpellPossibility> {
public PossibilityIterator(Map<Token, LinkedHashMap<String, Integer>> suggestions, int maximumRequiredSuggestions, int maxEvaluations) { public PossibilityIterator(Map<Token, LinkedHashMap<String, Integer>> suggestions, int maximumRequiredSuggestions, int maxEvaluations) {
for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) { for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
Token token = entry.getKey(); Token token = entry.getKey();
if(entry.getValue().size()==0) {
continue;
}
List<SpellCheckCorrection> possibleCorrections = new ArrayList<SpellCheckCorrection>(); List<SpellCheckCorrection> possibleCorrections = new ArrayList<SpellCheckCorrection>();
for (Map.Entry<String, Integer> entry1 : entry.getValue().entrySet()) { for (Map.Entry<String, Integer> entry1 : entry.getValue().entrySet()) {
SpellCheckCorrection correction = new SpellCheckCorrection(); SpellCheckCorrection correction = new SpellCheckCorrection();

View File

@ -17,13 +17,24 @@ package org.apache.solr.spelling;
*/ */
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.solr.client.solrj.response.SpellCheckResponse;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SpellCheckMergeData;
import org.apache.solr.schema.FieldType; import org.apache.solr.schema.FieldType;
import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SolrIndexSearcher;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
/** /**
@ -64,6 +75,74 @@ public abstract class SolrSpellChecker {
} }
return name; return name;
} }
/**
 * Integrate spelling suggestions from the various shards in a distributed environment.
 * <p>
 * A token is kept only if every responding shard reported it as misspelled. Its alternatives
 * are scored against the original token with this spellchecker's string distance
 * (LevensteinDistance when none is available), alternatives scoring below the accuracy
 * threshold (0.5 when {@link #getAccuracy()} is unsupported) are dropped, and the best
 * <code>count</code> of the remaining ones are returned ordered best first.
 *
 * @param mergeData suggestions, frequencies and shard counts collected from the shard responses
 * @param numSug maximum number of candidate alternatives ranked per token
 * @param count maximum number of alternatives returned per token
 * @param extendedResults if true, the summed original-term frequency and each alternative's
 *        frequency are added to the result; otherwise only the alternative strings are
 * @return the merged suggestions
 */
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
  float minAccuracy = 0.5f;
  try {
    minAccuracy = getAccuracy();
  } catch (UnsupportedOperationException uoe) {
    // just use .5 as a default
  }

  // The base getStringDistance() throws rather than returning null, so a spellchecker that
  // does not override it must not break the merge: treat "unsupported" like "none".
  StringDistance sd = null;
  try {
    sd = getStringDistance();
  } catch (UnsupportedOperationException uoe) {
    // fall through to the default distance
  }
  if (sd == null) {
    sd = new LevensteinDistance();
  }

  SpellingResult result = new SpellingResult();
  for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
    String original = entry.getKey();

    // Only use this suggestion if all shards reported it as misspelled.
    Integer numShards = mergeData.origVsShards.get(original);
    if (numShards == null || numShards < mergeData.totalNumberShardResponses) {
      continue;
    }

    // The threshold is raised while this token's queue is full; it must start over for
    // every token so one token's candidates cannot suppress another token's.
    float min = minAccuracy;
    HashSet<String> suggested = entry.getValue();
    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
    for (String suggestion : suggested) {
      SuggestWord sug = mergeData.suggestedVsWord.get(suggestion);
      sug.score = sd.getDistance(original, sug.string);
      if (sug.score < min) continue;
      sugQueue.insertWithOverflow(sug);
      if (sugQueue.size() == numSug) {
        // if queue full, maintain the minScore score
        min = sugQueue.top().score;
      }
    }

    // create token
    SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original);
    Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());

    // get top 'count' suggestions out of 'sugQueue.size()' candidates
    int keep = Math.max(0, Math.min(count, sugQueue.size()));
    // Discard the lowest-scored surplus entries. The number to discard is fixed up front:
    // re-reading sugQueue.size() while popping would stop early and leave the worst
    // candidates, not the best, at the head of the queue.
    for (int excess = sugQueue.size() - keep; excess > 0; excess--) {
      sugQueue.pop();
    }
    // the queue yields lowest score first, so fill the array back to front
    SuggestWord[] suggestions = new SuggestWord[keep];
    for (int k = keep - 1; k >= 0; k--) {
      suggestions[k] = sugQueue.pop();
    }

    if (extendedResults) {
      Integer o = mergeData.origVsFreq.get(original);
      if (o != null) result.addFrequency(token, o);
      for (SuggestWord word : suggestions) {
        result.add(token, word.string, word.freq);
      }
    } else {
      List<String> words = new ArrayList<String>(suggestions.length);
      for (SuggestWord word : suggestions) {
        words.add(word.string);
      }
      result.add(token, words);
    }
  }
  return result;
}
public Analyzer getQueryAnalyzer() { public Analyzer getQueryAnalyzer() {
return analyzer; return analyzer;
@ -85,6 +164,23 @@ public abstract class SolrSpellChecker {
*/ */
public abstract void build(SolrCore core, SolrIndexSearcher searcher); public abstract void build(SolrCore core, SolrIndexSearcher searcher);
/**
 * Get the value of {@link SpellingParams#SPELLCHECK_ACCURACY} if supported.
 * <p>
 * This base implementation always throws; spellcheckers that have an accuracy setting
 * override it.
 *
 * @return the accuracy used by this spellchecker
 * @throws UnsupportedOperationException if this spellchecker does not support an accuracy setting
 */
protected float getAccuracy() {
throw new UnsupportedOperationException();
}
/**
 * Get the distance implementation used by this spellchecker, or <code>null</code> if not
 * applicable.
 * <p>
 * This base implementation returns <code>null</code>, as documented, so that spellcheckers
 * without a distance measure still work with {@link #mergeSuggestions}, which substitutes
 * LevensteinDistance for a <code>null</code> distance.
 *
 * @return the string distance in use, or <code>null</code> when this spellchecker has none
 */
protected StringDistance getStringDistance() {
  return null;
}
/** /**
* Get suggestions for the given query. Tokenizes the query using a field appropriate Analyzer. * Get suggestions for the given query. Tokenizes the query using a field appropriate Analyzer.

View File

@ -323,6 +323,12 @@
<str name="spellcheckIndexDir">spellchecker1</str> <str name="spellcheckIndexDir">spellchecker1</str>
<str name="buildOnCommit">false</str> <str name="buildOnCommit">false</str>
</lst> </lst>
<lst name="spellchecker">
<str name="name">direct</str>
<str name="classname">DirectSolrSpellChecker</str>
<str name="field">lowerfilt</str>
<int name="minQueryLength">3</int>
</lst>
<lst name="spellchecker"> <lst name="spellchecker">
<str name="name">multipleFields</str> <str name="name">multipleFields</str>
<str name="field">lowerfilt1and2</str> <str name="field">lowerfilt1and2</str>
@ -397,6 +403,17 @@
<arr name="last-components"> <arr name="last-components">
<str>spellcheck</str> <str>spellcheck</str>
</arr> </arr>
</requestHandler>
<requestHandler name="spellCheckCompRH_Direct" class="org.apache.solr.handler.component.SearchHandler">
<lst name="defaults">
<str name="spellcheck.dictionary">direct</str>
<str name="spellcheck.onlyMorePopular">false</str>
<str name="spellcheck.extendedResults">false</str>
<str name="spellcheck.count">1</str>
</lst>
<arr name="last-components">
<str>spellcheck</str>
</arr>
</requestHandler> </requestHandler>
<requestHandler name="spellCheckCompRH1" class="org.apache.solr.handler.component.SearchHandler"> <requestHandler name="spellCheckCompRH1" class="org.apache.solr.handler.component.SearchHandler">
<lst name="defaults"> <lst name="defaults">

View File

@ -30,10 +30,13 @@ import org.apache.solr.common.params.ModifiableSolrParams;
*/ */
public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase { public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase {
private String requestHandlerName;
public DistributedSpellCheckComponentTest() public DistributedSpellCheckComponentTest()
{ {
//fixShardCount=true; //fixShardCount=true;
//shardCount=2; //shardCount=2;
//stress=0;
} }
private String saveProp; private String saveProp;
@ -42,6 +45,7 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
// this test requires FSDir // this test requires FSDir
saveProp = System.getProperty("solr.directoryFactory"); saveProp = System.getProperty("solr.directoryFactory");
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
requestHandlerName = random.nextBoolean() ? "spellCheckCompRH" : "spellCheckCompRH_Direct";
super.setUp(); super.setUp();
} }
@ -104,15 +108,17 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
handle.put("maxScore", SKIPVAL); handle.put("maxScore", SKIPVAL);
// we care only about the spellcheck results // we care only about the spellcheck results
handle.put("response", SKIP); handle.put("response", SKIP);
q("q", "*:*", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH"); q("q", "*:*", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH");
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH"); query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName);
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true"); query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true");
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4"); query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4");
query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true"); query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true");
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true");
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "0", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true");
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "0", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
} }
} }

View File

@ -175,7 +175,7 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 {
result = checker.getSuggestions(spellOpts); result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null); assertTrue("result is null and it shouldn't be", result != null);
suggestions = result.get(spellOpts.tokens.iterator().next()); suggestions = result.get(spellOpts.tokens.iterator().next());
assertTrue("suggestions is not null and it should be", suggestions == null); assertTrue("suggestions size should be 0", suggestions.size()==0);
searcher.decref(); searcher.decref();
} }
} }

View File

@ -140,7 +140,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
result = checker.getSuggestions(spellOpts); result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null); assertTrue("result is null and it shouldn't be", result != null);
suggestions = result.get(spellOpts.tokens.iterator().next()); suggestions = result.get(spellOpts.tokens.iterator().next());
assertTrue("suggestions is not null and it should be", suggestions == null); assertTrue("suggestions size should be 0", suggestions.size()==0);
//test something that is spelled correctly //test something that is spelled correctly
spellOpts.tokens = queryConverter.convert("document"); spellOpts.tokens = queryConverter.convert("document");
@ -215,7 +215,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
result = checker.getSuggestions(spellOpts); result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null); assertTrue("result is null and it shouldn't be", result != null);
suggestions = result.get(spellOpts.tokens.iterator().next()); suggestions = result.get(spellOpts.tokens.iterator().next());
assertTrue("suggestions is not null and it should be", suggestions == null); assertTrue("suggestions size should be 0", suggestions.size()==0);
spellOpts.tokens = queryConverter.convert("document"); spellOpts.tokens = queryConverter.convert("document");
result = checker.getSuggestions(spellOpts); result = checker.getSuggestions(spellOpts);
@ -328,7 +328,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
result = checker.getSuggestions(spellOpts); result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null); assertTrue("result is null and it shouldn't be", result != null);
suggestions = result.get(spellOpts.tokens.iterator().next()); suggestions = result.get(spellOpts.tokens.iterator().next());
assertTrue("suggestions is not null and it should be", suggestions == null); assertTrue("suggestions size should be 0", suggestions.size()==0);
spellOpts.tokens = queryConverter.convert("Caroline"); spellOpts.tokens = queryConverter.convert("Caroline");
result = checker.getSuggestions(spellOpts); result = checker.getSuggestions(spellOpts);