SOLR-2083: fix issue with misrreporting suggestions in distributed spell checking

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@995964 13f79535-47bb-0310-9956-ffa450edef68
2010-09-10 20:24:41 +00:00 · 2010-09-10 20:24:41 +00:00 · f6c0423c87
parent d38ec19a28
commit f6c0423c87
4 changed files with 89 additions and 15 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -475,7 +475,9 @@ Bug Fixes
  substitution/dereferencing.  Properly encode local params in
  distributed faceting.  (yonik)

-* SOLR-2114: Fixed parsing error in hsin function.  The function signature has changed slightly. (gsingers)  
+* SOLR-2114: Fixed parsing error in hsin function.  The function signature has changed slightly. (gsingers)
+
+* SOLR-2083: SpellCheckComponent misreports suggestions when distributed (James Dyer via gsingers)  


 Other Changes
--- a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
+++ b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
@ -46,6 +46,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ShardParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.SpellingParams;
 import org.apache.solr.common.util.NamedList;
@ -123,6 +124,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
    if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
      return;
    }
+    boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
    String q = params.get(SPELLCHECK_Q);
    SolrSpellChecker spellChecker = getSpellChecker(params);
    Collection<Token> tokens = null;
@ -147,13 +149,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
        IndexReader reader = rb.req.getSearcher().getReader();
        boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
        float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
-        SolrParams customParams = getCustomParams(getDictionaryName(params), params);
+        SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest);
        SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults,
                accuracy, customParams);
-
        SpellingResult spellingResult = spellChecker.getSuggestions(options);
        if (spellingResult != null) {
-          response.add("suggestions", toNamedList(spellingResult, q,
+          response.add("suggestions", toNamedList(shardRequest, spellingResult, q,
              extendedResults, collate));
          rb.rsp.add("spellcheck", response);
        }
@ -171,7 +172,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
   * @param params The original SolrParams
   * @return The new Params
   */
-  protected SolrParams getCustomParams(String dictionary, SolrParams params) {
+  protected SolrParams getCustomParams(String dictionary, SolrParams params, boolean shardRequest) {
    ModifiableSolrParams result = new ModifiableSolrParams();
    Iterator<String> iter = params.getParameterNamesIterator();
    String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + ".";
@ -181,6 +182,10 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
        result.add(nxt.substring(prefix.length()), params.getParams(nxt));
      }
    }
+    if(shardRequest)
+    {
+    	result.add(ShardParams.IS_SHARD, "true");
+    }
    return result;
  }

@ -243,17 +248,21 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
    Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
    // original token string -> summed up frequency
    Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
+    // original token string -> # of shards reporting it as misspelled
+    Map<String, Integer> origVsShards = new HashMap<String, Integer>();
    // original token string -> set of alternatives
    // must preserve order because collation algorithm can only work in-order
    Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
    // alternative string -> corresponding SuggestWord object
    Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
-
+    
+    int totalNumberShardResponses = 0;
    for (ShardRequest sreq : rb.finished) {
      for (ShardResponse srsp : sreq.responses) {
        NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
        LOG.info(srsp.getShard() + " " + nl);
        if (nl != null) {
+        	totalNumberShardResponses++;
          SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
          for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) {
            origVsSuggestion.put(suggestion.getToken(), suggestion);
@ -269,6 +278,14 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
            if (o != null)  origFreq += o;
            origFreq += suggestion.getOriginalFrequency();
            origVsFreq.put(suggestion.getToken(), origFreq);
+            
+            //# shards reporting
+            Integer origShards = origVsShards.get(suggestion.getToken());
+            if(origShards==null) {
+            	origVsShards.put(suggestion.getToken(), 1);
+            } else {
+            	origVsShards.put(suggestion.getToken(), ++origShards);
+            }            

            // find best suggestions
            for (int i = 0; i < suggestion.getNumFound(); i++) {
@ -296,6 +313,13 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
    SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand?
    for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet()) {
      String original = entry.getKey();
+      
+      //Only use this suggestion if all shards reported it as misspelled.
+      Integer numShards = origVsShards.get(original);
+      if(numShards<totalNumberShardResponses) {
+      	continue;
+      }
+      
      HashSet<String> suggested = entry.getValue();
      SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
      for (String suggestion : suggested) {
@ -335,7 +359,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
    }
    
    NamedList response = new SimpleOrderedMap();
-    response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate));
+    response.add("suggestions", toNamedList(false, result, origQuery, extendedResults, collate));
    rb.rsp.add("spellcheck", response);
  }

@ -383,7 +407,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
    return spellCheckers.get(name);
  }

-  protected NamedList toNamedList(SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
+  protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
    NamedList result = new NamedList();
    Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
    boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
@ -393,15 +417,23 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
      best = new LinkedHashMap<Token, String>(suggestions.size());
    }
    
+    int numSuggestions = 0;
+    for(LinkedHashMap<String, Integer> theSuggestion : suggestions.values())
+    {
+    	if(theSuggestion.size()>0)
+    	{
+    		numSuggestions++;
+    	}
+    }    
    // will be flipped to false if any of the suggestions are not in the index and hasFreqInfo is true
-    if(suggestions.size() > 0) {
+    if(numSuggestions > 0) {
      isCorrectlySpelled = true;
    }
    
    for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
      Token inputToken = entry.getKey();
      Map<String, Integer> theSuggestions = entry.getValue();
-      if (theSuggestions != null && theSuggestions.size() > 0) {
+      if (theSuggestions != null && (theSuggestions.size()>0 || shardRequest)) {
        SimpleOrderedMap suggestionList = new SimpleOrderedMap();
        suggestionList.add("numFound", theSuggestions.size());
        suggestionList.add("startOffset", inputToken.startOffset());
@ -430,7 +462,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
          suggestionList.add("suggestion", theSuggestions.keySet());
        }

-        if (collate == true ){//set aside the best suggestion for this token
+        if (collate == true && theSuggestions.size()>0){//set aside the best suggestion for this token
          best.put(inputToken, theSuggestions.keySet().iterator().next());
        }
        if (hasFreqInfo) {
--- a/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
+++ b/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
@ -22,6 +22,7 @@ import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;

@ -41,6 +42,8 @@ import org.apache.lucene.search.spell.SpellChecker;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.solr.common.params.ShardParams;
+import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.schema.FieldType;
@ -153,6 +156,12 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
  
  @Override
  public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
+  	boolean shardRequest = false;
+  	SolrParams params = options.customParams;
+  	if(params!=null)
+  	{
+  		shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
+  	}
    SpellingResult result = new SpellingResult(options.tokens);
    IndexReader reader = determineReader(options.reader);
    Term term = field != null ? new Term(field, "") : null;
@ -167,7 +176,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
            field,
            options.onlyMorePopular, theAccuracy);
      if (suggestions.length == 1 && suggestions[0].equals(tokenText)) {
-        //These are spelled the same, continue on
+      	//These are spelled the same, continue on
        continue;
      }

@ -175,9 +184,15 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
        term = term.createTerm(tokenText);
        result.add(token, reader.docFreq(term));
        int countLimit = Math.min(options.count, suggestions.length);
-        for (int i = 0; i < countLimit; i++) {
-          term = term.createTerm(suggestions[i]);
-          result.add(token, suggestions[i], reader.docFreq(term));
+        if(countLimit>0)
+        {
+	        for (int i = 0; i < countLimit; i++) {
+	          term = term.createTerm(suggestions[i]);
+	          result.add(token, suggestions[i], reader.docFreq(term));
+	        }
+        } else if(shardRequest) {
+        	List<String> suggList = Collections.emptyList();
+        	result.add(token, suggList);
        }
      } else {
        if (suggestions.length > 0) {
@ -186,6 +201,9 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
            suggList = suggList.subList(0, options.count);
          }
          result.add(token, suggList);
+        } else if(shardRequest) {
+        	List<String> suggList = Collections.emptyList();
+        	result.add(token, suggList);
        }
      }
    }
--- a/solr/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
+++ b/solr/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
@ -1,8 +1,11 @@
 package org.apache.solr.handler.component;

+import java.io.File;
+
 import org.apache.solr.BaseDistributedSearchTestCase;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.util.AbstractSolrTestCase;

 /**
 * Test for SpellCheckComponent's distributed querying
@ -13,6 +16,12 @@ import org.apache.solr.common.params.ModifiableSolrParams;
 */
 public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase {
  
+	public DistributedSpellCheckComponentTest()
+	{
+		//fixShardCount=true;
+		//shardCount=2;
+	}
+	
  private String saveProp;
  @Override
  public void setUp() throws Exception {
@ -49,6 +58,7 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
  
  @Override
  public void doTest() throws Exception {
+  	del("*:*");
    index(id, "1", "lowerfilt", "toyota");
    index(id, "2", "lowerfilt", "chevrolet");
    index(id, "3", "lowerfilt", "suzuki");
@ -60,6 +70,18 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
    index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy brown dogs.");
    index(id, "10", "lowerfilt", "blue");
    index(id, "12", "lowerfilt", "glue");
+    index(id, "13", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "14", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "15", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "16", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "17", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "18", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "19", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "20", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "21", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "22", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "23", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
+    index(id, "24", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
    commit();

    handle.clear();