SOLR-2083: fix issue with misreporting suggestions in distributed spell checking

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@995964 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2010-09-10 20:24:41 +00:00
parent d38ec19a28
commit f6c0423c87
4 changed files with 89 additions and 15 deletions

View File

@ -477,6 +477,8 @@ Bug Fixes
* SOLR-2114: Fixed parsing error in hsin function. The function signature has changed slightly. (gsingers)
* SOLR-2083: SpellCheckComponent misreports suggestions when distributed (James Dyer via gsingers)
Other Changes
----------------------

View File

@ -46,6 +46,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.common.util.NamedList;
@ -123,6 +124,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
return;
}
boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
String q = params.get(SPELLCHECK_Q);
SolrSpellChecker spellChecker = getSpellChecker(params);
Collection<Token> tokens = null;
@ -147,13 +149,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
IndexReader reader = rb.req.getSearcher().getReader();
boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
SolrParams customParams = getCustomParams(getDictionaryName(params), params);
SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest);
SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults,
accuracy, customParams);
SpellingResult spellingResult = spellChecker.getSuggestions(options);
if (spellingResult != null) {
response.add("suggestions", toNamedList(spellingResult, q,
response.add("suggestions", toNamedList(shardRequest, spellingResult, q,
extendedResults, collate));
rb.rsp.add("spellcheck", response);
}
@ -171,7 +172,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
* @param params The original SolrParams
* @return The new Params
*/
protected SolrParams getCustomParams(String dictionary, SolrParams params) {
protected SolrParams getCustomParams(String dictionary, SolrParams params, boolean shardRequest) {
ModifiableSolrParams result = new ModifiableSolrParams();
Iterator<String> iter = params.getParameterNamesIterator();
String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + ".";
@ -181,6 +182,10 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
result.add(nxt.substring(prefix.length()), params.getParams(nxt));
}
}
if(shardRequest)
{
result.add(ShardParams.IS_SHARD, "true");
}
return result;
}
@ -243,17 +248,21 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
// original token string -> summed up frequency
Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
// original token string -> # of shards reporting it as misspelled
Map<String, Integer> origVsShards = new HashMap<String, Integer>();
// original token string -> set of alternatives
// must preserve order because collation algorithm can only work in-order
Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
// alternative string -> corresponding SuggestWord object
Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
int totalNumberShardResponses = 0;
for (ShardRequest sreq : rb.finished) {
for (ShardResponse srsp : sreq.responses) {
NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
LOG.info(srsp.getShard() + " " + nl);
if (nl != null) {
totalNumberShardResponses++;
SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) {
origVsSuggestion.put(suggestion.getToken(), suggestion);
@ -270,6 +279,14 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
origFreq += suggestion.getOriginalFrequency();
origVsFreq.put(suggestion.getToken(), origFreq);
//# shards reporting
Integer origShards = origVsShards.get(suggestion.getToken());
if(origShards==null) {
origVsShards.put(suggestion.getToken(), 1);
} else {
origVsShards.put(suggestion.getToken(), ++origShards);
}
// find best suggestions
for (int i = 0; i < suggestion.getNumFound(); i++) {
String alternative = suggestion.getAlternatives().get(i);
@ -296,6 +313,13 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand?
for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet()) {
String original = entry.getKey();
//Only use this suggestion if all shards reported it as misspelled.
Integer numShards = origVsShards.get(original);
if(numShards<totalNumberShardResponses) {
continue;
}
HashSet<String> suggested = entry.getValue();
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
for (String suggestion : suggested) {
@ -335,7 +359,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
}
NamedList response = new SimpleOrderedMap();
response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate));
response.add("suggestions", toNamedList(false, result, origQuery, extendedResults, collate));
rb.rsp.add("spellcheck", response);
}
@ -383,7 +407,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
return spellCheckers.get(name);
}
protected NamedList toNamedList(SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
NamedList result = new NamedList();
Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
@ -393,15 +417,23 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
best = new LinkedHashMap<Token, String>(suggestions.size());
}
int numSuggestions = 0;
for(LinkedHashMap<String, Integer> theSuggestion : suggestions.values())
{
if(theSuggestion.size()>0)
{
numSuggestions++;
}
}
// will be flipped to false if any of the suggestions are not in the index and hasFreqInfo is true
if(suggestions.size() > 0) {
if(numSuggestions > 0) {
isCorrectlySpelled = true;
}
for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
Token inputToken = entry.getKey();
Map<String, Integer> theSuggestions = entry.getValue();
if (theSuggestions != null && theSuggestions.size() > 0) {
if (theSuggestions != null && (theSuggestions.size()>0 || shardRequest)) {
SimpleOrderedMap suggestionList = new SimpleOrderedMap();
suggestionList.add("numFound", theSuggestions.size());
suggestionList.add("startOffset", inputToken.startOffset());
@ -430,7 +462,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
suggestionList.add("suggestion", theSuggestions.keySet());
}
if (collate == true ){//set aside the best suggestion for this token
if (collate == true && theSuggestions.size()>0){//set aside the best suggestion for this token
best.put(inputToken, theSuggestions.keySet().iterator().next());
}
if (hasFreqInfo) {

View File

@ -22,6 +22,7 @@ import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@ -41,6 +42,8 @@ import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
@ -153,6 +156,12 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
boolean shardRequest = false;
SolrParams params = options.customParams;
if(params!=null)
{
shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
}
SpellingResult result = new SpellingResult(options.tokens);
IndexReader reader = determineReader(options.reader);
Term term = field != null ? new Term(field, "") : null;
@ -175,10 +184,16 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
term = term.createTerm(tokenText);
result.add(token, reader.docFreq(term));
int countLimit = Math.min(options.count, suggestions.length);
if(countLimit>0)
{
for (int i = 0; i < countLimit; i++) {
term = term.createTerm(suggestions[i]);
result.add(token, suggestions[i], reader.docFreq(term));
}
} else if(shardRequest) {
List<String> suggList = Collections.emptyList();
result.add(token, suggList);
}
} else {
if (suggestions.length > 0) {
List<String> suggList = Arrays.asList(suggestions);
@ -186,6 +201,9 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
suggList = suggList.subList(0, options.count);
}
result.add(token, suggList);
} else if(shardRequest) {
List<String> suggList = Collections.emptyList();
result.add(token, suggList);
}
}
}

View File

@ -1,8 +1,11 @@
package org.apache.solr.handler.component;
import java.io.File;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.util.AbstractSolrTestCase;
/**
* Test for SpellCheckComponent's distributed querying
@ -13,6 +16,12 @@ import org.apache.solr.common.params.ModifiableSolrParams;
*/
public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase {
/**
 * Creates the distributed spell-check test case.
 *
 * The constructor currently performs no initialization of its own: both
 * statements in the body are commented out.
 */
public DistributedSpellCheckComponentTest()
{
// NOTE(review): the two settings below are disabled. They presumably pin the
// test to a fixed number of shards (fields inherited from the base test case)
// - confirm against BaseDistributedSearchTestCase before re-enabling.
//fixShardCount=true;
//shardCount=2;
}
private String saveProp;
@Override
public void setUp() throws Exception {
@ -49,6 +58,7 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
@Override
public void doTest() throws Exception {
del("*:*");
index(id, "1", "lowerfilt", "toyota");
index(id, "2", "lowerfilt", "chevrolet");
index(id, "3", "lowerfilt", "suzuki");
@ -60,6 +70,18 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy brown dogs.");
index(id, "10", "lowerfilt", "blue");
index(id, "12", "lowerfilt", "glue");
index(id, "13", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "14", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "15", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "16", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "17", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "18", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "19", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "20", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "21", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "22", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "23", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "24", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
commit();
handle.clear();