From 9b9ac66574f5d9a3b900917b0f05f17bdb8db633 Mon Sep 17 00:00:00 2001 From: Shalin Shekhar Mangar Date: Wed, 9 Dec 2009 13:22:52 +0000 Subject: [PATCH] SOLR-785 -- Distributed Search support for SpellCheckComponent git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@888796 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + .../component/SpellCheckComponent.java | 224 +++++++++++++++++- .../spelling/AbstractLuceneSpellChecker.java | 7 +- .../DistributedSpellCheckComponentTest.java | 39 +++ 4 files changed, 265 insertions(+), 8 deletions(-) create mode 100644 src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java diff --git a/CHANGES.txt b/CHANGES.txt index cfbb76dbada..189fb81bcc2 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -53,6 +53,9 @@ New Features * SOLR-1571: Added unicode collation support though Lucene's CollationKeyFilter (Robert Muir via shalin) +* SOLR-785: Distributed Search support for SpellCheckComponent + (Matthew Woytowitz, shalin) + Optimizations ---------------------- diff --git a/src/java/org/apache/solr/handler/component/SpellCheckComponent.java b/src/java/org/apache/solr/handler/component/SpellCheckComponent.java index f80df573bca..4eaaa7474ef 100644 --- a/src/java/org/apache/solr/handler/component/SpellCheckComponent.java +++ b/src/java/org/apache/solr/handler/component/SpellCheckComponent.java @@ -19,14 +19,13 @@ package org.apache.solr.handler.component; import java.io.IOException; import java.io.StringReader; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Collections; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; + +import org.apache.lucene.search.spell.LevensteinDistance; +import org.apache.lucene.search.spell.StringDistance; +import org.apache.lucene.util.PriorityQueue; +import org.apache.solr.client.solrj.response.SpellCheckResponse; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -152,6 +151,217 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar } } + static class SuggestWordQueue extends PriorityQueue { + SuggestWordQueue(int size) { + initialize(size); + } + + @Override + protected boolean lessThan(Object a, Object b) { + SuggestWord wa = (SuggestWord) a; + SuggestWord wb = (SuggestWord) b; + int val = wa.compareTo(wb); + return val < 0; + } + } + + /** + * Borrowed from Lucene SpellChecker + */ + static class SuggestWord { + /** + * the score of the word + */ + public float score; + + /** + * The freq of the word + */ + public int freq; + + /** + * the suggested word + */ + public String string; + + public final int compareTo(SuggestWord a) { + // first criteria: the edit distance + if (score > a.score) { + return 1; + } + if (score < a.score) { + return -1; + } + + // second criteria (if first criteria is equal): the popularity + if (freq > a.freq) { + return 1; + } + + if (freq < a.freq) { + return -1; + } + return 0; + } + } + + @Override + public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) { + SolrParams params = rb.req.getParams(); + // Turn on spellcheck only only when retrieving fields + if (!params.getBool(COMPONENT_NAME, false)) return; + if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) { + // fetch at least 5 suggestions from each shard + int count = sreq.params.getInt(SPELLCHECK_COUNT, 1); + if (count < 5) count = 5; + sreq.params.set(SPELLCHECK_COUNT, count); + sreq.params.set("spellcheck", "true"); + } else { + sreq.params.set("spellcheck", "false"); + } + } + + @Override + @SuppressWarnings({"unchecked", "deprecation"}) + public void finishStage(ResponseBuilder rb) { + SolrParams params = rb.req.getParams(); + if (!params.getBool(COMPONENT_NAME, false) || rb.stage != ResponseBuilder.STAGE_GET_FIELDS) + return; + + boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false); + boolean collate = params.getBool(SPELLCHECK_COLLATE, false); + + String origQuery = params.get(SPELLCHECK_Q); + if (origQuery == null) { + origQuery = rb.getQueryString(); + if (origQuery == null) { + origQuery = params.get(CommonParams.Q); + } + } + + int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1); + float min = 0.5f; + StringDistance sd = null; + int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT); + SolrSpellChecker checker = getSpellChecker(rb.req.getParams()); + if (checker instanceof AbstractLuceneSpellChecker) { + AbstractLuceneSpellChecker spellChecker = (AbstractLuceneSpellChecker) checker; + min = spellChecker.getAccuracy(); + sd = spellChecker.getStringDistance(); + } + if (sd == null) + sd = new LevensteinDistance(); + + Collection tokens = null; + try { + tokens = getTokens(origQuery, checker.getQueryAnalyzer()); + } catch (IOException e) { + LOG.error("Could not get tokens (this should never happen)", e); + } + + // original token -> corresponding Suggestion object (keep track of start,end) + Map origVsSuggestion = new HashMap(); + // original token string -> summed up frequency + Map origVsFreq = new HashMap(); + // original token string -> set of alternatives + // must preserve order because collation algorithm can only work in-order + Map> origVsSuggested = new LinkedHashMap>(); + // alternative string -> corresponding SuggestWord object + Map suggestedVsWord = new HashMap(); + + for (ShardRequest sreq : rb.finished) { + for (ShardResponse srsp : sreq.responses) { + NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck"); + LOG.info(srsp.getShard() + " " + nl); + if (nl != null) { + SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl); + for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) { + origVsSuggestion.put(suggestion.getToken(), suggestion); + HashSet suggested = origVsSuggested.get(suggestion.getToken()); + if (suggested == null) { + suggested = new HashSet(); + origVsSuggested.put(suggestion.getToken(), suggested); + } + + // sum up original frequency + int origFreq = 0; + Integer o = origVsFreq.get(suggestion.getToken()); + if (o != null) origFreq += o; + origFreq += suggestion.getOriginalFrequency(); + origVsFreq.put(suggestion.getToken(), origFreq); + + // find best suggestions + for (int i = 0; i < suggestion.getNumFound(); i++) { + String alternative = suggestion.getAlternatives().get(i); + suggested.add(alternative); + SuggestWord sug = suggestedVsWord.get(alternative); + if (sug == null) { + sug = new SuggestWord(); + suggestedVsWord.put(alternative, sug); + } + sug.string = alternative; + // alternative frequency is present only for extendedResults=true + if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) { + Integer freq = suggestion.getAlternativeFrequencies().get(i); + if (freq != null) sug.freq += freq; + } + } + } + } + } + } + + // all shard responses have been collected + // create token and get top suggestions + SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand? + for (Map.Entry> entry : origVsSuggested.entrySet()) { + String original = entry.getKey(); + HashSet suggested = entry.getValue(); + SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); + for (String suggestion : suggested) { + SuggestWord sug = suggestedVsWord.get(suggestion); + sug.score = sd.getDistance(original, sug.string); + if (sug.score < min) continue; + sugQueue.insertWithOverflow(sug); + if (sugQueue.size() == numSug) { + // if queue full, maintain the minScore score + min = ((SuggestWord) sugQueue.top()).score; + } + } + + // create token + SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original); + Token token = new Token(); + token.setTermText(original); + token.setStartOffset(suggestion.getStartOffset()); + token.setEndOffset(suggestion.getEndOffset()); + + // get top 'count' suggestions out of 'sugQueue.size()' candidates + SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())]; + // skip the first sugQueue.size() - count elements + for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop(); + // now collect the top 'count' responses + for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) { + suggestions[k] = ((SuggestWord) sugQueue.pop()); + } + + if (extendedResults) { + Integer o = origVsFreq.get(original); + if (o != null) result.add(token, o); + for (SuggestWord word : suggestions) + result.add(token, word.string, word.freq); + } else { + List words = new ArrayList(sugQueue.size()); + for (SuggestWord word : suggestions) words.add(word.string); + result.add(token, words); + } + } + + NamedList response = new SimpleOrderedMap(); + response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate)); + rb.rsp.add("spellcheck", response); + } + private Collection getTokens(String q, Analyzer analyzer) throws IOException { Collection result = new ArrayList(); Token token = null; diff --git a/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java b/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java index deda26d469f..bb8c8c63241 100644 --- a/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java +++ b/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java @@ -77,6 +77,8 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { protected float accuracy = 0.5f; public static final String FIELD = "field"; + protected StringDistance sd; + public String init(NamedList config, SolrCore core) { super.init(config, core); indexDir = (String) config.get(INDEX_DIR); @@ -90,7 +92,6 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { sourceLocation = (String) config.get(LOCATION); field = (String) config.get(FIELD); String strDistanceName = (String)config.get(STRING_DISTANCE); - StringDistance sd = null; if (strDistanceName != null) { sd = (StringDistance) core.getResourceLoader().newInstance(strDistanceName); //TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils... @@ -226,4 +227,8 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { public String getSourceLocation() { return sourceLocation; } + + public StringDistance getStringDistance() { + return sd; + } } diff --git a/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java b/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java new file mode 100644 index 00000000000..6d3ea770a16 --- /dev/null +++ b/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java @@ -0,0 +1,39 @@ +package org.apache.solr.handler.component; + +import org.apache.solr.BaseDistributedSearchTestCase; + +/** + * Test for SpellCheckComponent's distributed querying + * + * @since solr 1.5 + * @version $Id$ + * @see org.apache.solr.handler.component.SpellCheckComponent + */ +public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase { + + @Override + public void doTest() throws Exception { + index(id, "1", "lowerfilt", "toyota"); + index(id, "2", "lowerfilt", "chevrolet"); + index(id, "3", "lowerfilt", "suzuki"); + index(id, "4", "lowerfilt", "ford"); + index(id, "5", "lowerfilt", "ferrari"); + index(id, "6", "lowerfilt", "jaguar"); + index(id, "7", "lowerfilt", "mclaren"); + index(id, "8", "lowerfilt", "sonata"); + index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy brown dogs."); + index(id, "10", "lowerfilt", "blue"); + index(id, "12", "lowerfilt", "glue"); + commit(); + + handle.clear(); + handle.put("QTime", SKIPVAL); + handle.put("timestamp", SKIPVAL); + handle.put("maxScore", SKIPVAL); + + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH"); + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true"); + query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4"); + query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true"); + } +}