mirror of https://github.com/apache/lucene.git
SOLR-2083: fix issue with misrreporting suggestions in distributed spell checking
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@995964 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d38ec19a28
commit
f6c0423c87
|
@ -477,6 +477,8 @@ Bug Fixes
|
|||
|
||||
* SOLR-2114: Fixed parsing error in hsin function. The function signature has changed slightly. (gsingers)
|
||||
|
||||
* SOLR-2083: SpellCheckComponent misreports suggestions when distributed (James Dyer via gsingers)
|
||||
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
|
|
@ -46,6 +46,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.ShardParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.params.SpellingParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
|
@ -123,6 +124,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
|
||||
String q = params.get(SPELLCHECK_Q);
|
||||
SolrSpellChecker spellChecker = getSpellChecker(params);
|
||||
Collection<Token> tokens = null;
|
||||
|
@ -147,13 +149,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
IndexReader reader = rb.req.getSearcher().getReader();
|
||||
boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
|
||||
float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
|
||||
SolrParams customParams = getCustomParams(getDictionaryName(params), params);
|
||||
SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest);
|
||||
SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults,
|
||||
accuracy, customParams);
|
||||
|
||||
SpellingResult spellingResult = spellChecker.getSuggestions(options);
|
||||
if (spellingResult != null) {
|
||||
response.add("suggestions", toNamedList(spellingResult, q,
|
||||
response.add("suggestions", toNamedList(shardRequest, spellingResult, q,
|
||||
extendedResults, collate));
|
||||
rb.rsp.add("spellcheck", response);
|
||||
}
|
||||
|
@ -171,7 +172,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
* @param params The original SolrParams
|
||||
* @return The new Params
|
||||
*/
|
||||
protected SolrParams getCustomParams(String dictionary, SolrParams params) {
|
||||
protected SolrParams getCustomParams(String dictionary, SolrParams params, boolean shardRequest) {
|
||||
ModifiableSolrParams result = new ModifiableSolrParams();
|
||||
Iterator<String> iter = params.getParameterNamesIterator();
|
||||
String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + ".";
|
||||
|
@ -181,6 +182,10 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
result.add(nxt.substring(prefix.length()), params.getParams(nxt));
|
||||
}
|
||||
}
|
||||
if(shardRequest)
|
||||
{
|
||||
result.add(ShardParams.IS_SHARD, "true");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -243,17 +248,21 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
|
||||
// original token string -> summed up frequency
|
||||
Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
|
||||
// original token string -> # of shards reporting it as misspelled
|
||||
Map<String, Integer> origVsShards = new HashMap<String, Integer>();
|
||||
// original token string -> set of alternatives
|
||||
// must preserve order because collation algorithm can only work in-order
|
||||
Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
|
||||
// alternative string -> corresponding SuggestWord object
|
||||
Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
|
||||
|
||||
int totalNumberShardResponses = 0;
|
||||
for (ShardRequest sreq : rb.finished) {
|
||||
for (ShardResponse srsp : sreq.responses) {
|
||||
NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
|
||||
LOG.info(srsp.getShard() + " " + nl);
|
||||
if (nl != null) {
|
||||
totalNumberShardResponses++;
|
||||
SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
|
||||
for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) {
|
||||
origVsSuggestion.put(suggestion.getToken(), suggestion);
|
||||
|
@ -270,6 +279,14 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
origFreq += suggestion.getOriginalFrequency();
|
||||
origVsFreq.put(suggestion.getToken(), origFreq);
|
||||
|
||||
//# shards reporting
|
||||
Integer origShards = origVsShards.get(suggestion.getToken());
|
||||
if(origShards==null) {
|
||||
origVsShards.put(suggestion.getToken(), 1);
|
||||
} else {
|
||||
origVsShards.put(suggestion.getToken(), ++origShards);
|
||||
}
|
||||
|
||||
// find best suggestions
|
||||
for (int i = 0; i < suggestion.getNumFound(); i++) {
|
||||
String alternative = suggestion.getAlternatives().get(i);
|
||||
|
@ -296,6 +313,13 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand?
|
||||
for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet()) {
|
||||
String original = entry.getKey();
|
||||
|
||||
//Only use this suggestion if all shards reported it as misspelled.
|
||||
Integer numShards = origVsShards.get(original);
|
||||
if(numShards<totalNumberShardResponses) {
|
||||
continue;
|
||||
}
|
||||
|
||||
HashSet<String> suggested = entry.getValue();
|
||||
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
|
||||
for (String suggestion : suggested) {
|
||||
|
@ -335,7 +359,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
}
|
||||
|
||||
NamedList response = new SimpleOrderedMap();
|
||||
response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate));
|
||||
response.add("suggestions", toNamedList(false, result, origQuery, extendedResults, collate));
|
||||
rb.rsp.add("spellcheck", response);
|
||||
}
|
||||
|
||||
|
@ -383,7 +407,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
return spellCheckers.get(name);
|
||||
}
|
||||
|
||||
protected NamedList toNamedList(SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
|
||||
protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
|
||||
NamedList result = new NamedList();
|
||||
Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
|
||||
boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
|
||||
|
@ -393,15 +417,23 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
best = new LinkedHashMap<Token, String>(suggestions.size());
|
||||
}
|
||||
|
||||
int numSuggestions = 0;
|
||||
for(LinkedHashMap<String, Integer> theSuggestion : suggestions.values())
|
||||
{
|
||||
if(theSuggestion.size()>0)
|
||||
{
|
||||
numSuggestions++;
|
||||
}
|
||||
}
|
||||
// will be flipped to false if any of the suggestions are not in the index and hasFreqInfo is true
|
||||
if(suggestions.size() > 0) {
|
||||
if(numSuggestions > 0) {
|
||||
isCorrectlySpelled = true;
|
||||
}
|
||||
|
||||
for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
|
||||
Token inputToken = entry.getKey();
|
||||
Map<String, Integer> theSuggestions = entry.getValue();
|
||||
if (theSuggestions != null && theSuggestions.size() > 0) {
|
||||
if (theSuggestions != null && (theSuggestions.size()>0 || shardRequest)) {
|
||||
SimpleOrderedMap suggestionList = new SimpleOrderedMap();
|
||||
suggestionList.add("numFound", theSuggestions.size());
|
||||
suggestionList.add("startOffset", inputToken.startOffset());
|
||||
|
@ -430,7 +462,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
suggestionList.add("suggestion", theSuggestions.keySet());
|
||||
}
|
||||
|
||||
if (collate == true ){//set aside the best suggestion for this token
|
||||
if (collate == true && theSuggestions.size()>0){//set aside the best suggestion for this token
|
||||
best.put(inputToken, theSuggestions.keySet().iterator().next());
|
||||
}
|
||||
if (hasFreqInfo) {
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -41,6 +42,8 @@ import org.apache.lucene.search.spell.SpellChecker;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.solr.common.params.ShardParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
|
@ -153,6 +156,12 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
|
||||
@Override
|
||||
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
|
||||
boolean shardRequest = false;
|
||||
SolrParams params = options.customParams;
|
||||
if(params!=null)
|
||||
{
|
||||
shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
|
||||
}
|
||||
SpellingResult result = new SpellingResult(options.tokens);
|
||||
IndexReader reader = determineReader(options.reader);
|
||||
Term term = field != null ? new Term(field, "") : null;
|
||||
|
@ -167,7 +176,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
field,
|
||||
options.onlyMorePopular, theAccuracy);
|
||||
if (suggestions.length == 1 && suggestions[0].equals(tokenText)) {
|
||||
//These are spelled the same, continue on
|
||||
//These are spelled the same, continue on
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -175,9 +184,15 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
term = term.createTerm(tokenText);
|
||||
result.add(token, reader.docFreq(term));
|
||||
int countLimit = Math.min(options.count, suggestions.length);
|
||||
for (int i = 0; i < countLimit; i++) {
|
||||
term = term.createTerm(suggestions[i]);
|
||||
result.add(token, suggestions[i], reader.docFreq(term));
|
||||
if(countLimit>0)
|
||||
{
|
||||
for (int i = 0; i < countLimit; i++) {
|
||||
term = term.createTerm(suggestions[i]);
|
||||
result.add(token, suggestions[i], reader.docFreq(term));
|
||||
}
|
||||
} else if(shardRequest) {
|
||||
List<String> suggList = Collections.emptyList();
|
||||
result.add(token, suggList);
|
||||
}
|
||||
} else {
|
||||
if (suggestions.length > 0) {
|
||||
|
@ -186,6 +201,9 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
suggList = suggList.subList(0, options.count);
|
||||
}
|
||||
result.add(token, suggList);
|
||||
} else if(shardRequest) {
|
||||
List<String> suggList = Collections.emptyList();
|
||||
result.add(token, suggList);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
package org.apache.solr.handler.component;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.solr.BaseDistributedSearchTestCase;
|
||||
import org.apache.solr.client.solrj.SolrServer;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
|
||||
/**
|
||||
* Test for SpellCheckComponent's distributed querying
|
||||
|
@ -13,6 +16,12 @@ import org.apache.solr.common.params.ModifiableSolrParams;
|
|||
*/
|
||||
public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase {
|
||||
|
||||
public DistributedSpellCheckComponentTest()
|
||||
{
|
||||
//fixShardCount=true;
|
||||
//shardCount=2;
|
||||
}
|
||||
|
||||
private String saveProp;
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
|
@ -49,6 +58,7 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
|
|||
|
||||
@Override
|
||||
public void doTest() throws Exception {
|
||||
del("*:*");
|
||||
index(id, "1", "lowerfilt", "toyota");
|
||||
index(id, "2", "lowerfilt", "chevrolet");
|
||||
index(id, "3", "lowerfilt", "suzuki");
|
||||
|
@ -60,6 +70,18 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
|
|||
index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy brown dogs.");
|
||||
index(id, "10", "lowerfilt", "blue");
|
||||
index(id, "12", "lowerfilt", "glue");
|
||||
index(id, "13", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "14", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "15", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "16", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "17", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "18", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "19", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "20", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "21", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "22", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "23", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
index(id, "24", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
|
||||
commit();
|
||||
|
||||
handle.clear();
|
||||
|
|
Loading…
Reference in New Issue