mirror of https://github.com/apache/lucene.git
SOLR-785 -- Distributed Search support for SpellCheckComponent
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@888796 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent b91fb7f58c
commit 9b9ac66574
@@ -53,6 +53,9 @@ New Features
 * SOLR-1571: Added unicode collation support through Lucene's CollationKeyFilter
   (Robert Muir via shalin)
 
+* SOLR-785: Distributed Search support for SpellCheckComponent
+  (Matthew Woytowitz, shalin)
+
 Optimizations
 ----------------------
 
@@ -19,14 +19,13 @@ package org.apache.solr.handler.component;
 import java.io.IOException;
 import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.Collections;
+import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 
+import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.StringDistance;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.solr.client.solrj.response.SpellCheckResponse;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -152,6 +151,217 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAware
     }
   }
 
+  static class SuggestWordQueue extends PriorityQueue {
+    SuggestWordQueue(int size) {
+      initialize(size);
+    }
+
+    @Override
+    protected boolean lessThan(Object a, Object b) {
+      SuggestWord wa = (SuggestWord) a;
+      SuggestWord wb = (SuggestWord) b;
+      int val = wa.compareTo(wb);
+      return val < 0;
+    }
+  }
+
+  /**
+   * Borrowed from Lucene SpellChecker
+   */
+  static class SuggestWord {
+    /**
+     * the score of the word
+     */
+    public float score;
+
+    /**
+     * The freq of the word
+     */
+    public int freq;
+
+    /**
+     * the suggested word
+     */
+    public String string;
+
+    public final int compareTo(SuggestWord a) {
+      // first criteria: the edit distance
+      if (score > a.score) {
+        return 1;
+      }
+      if (score < a.score) {
+        return -1;
+      }
+
+      // second criteria (if first criteria is equal): the popularity
+      if (freq > a.freq) {
+        return 1;
+      }
+
+      if (freq < a.freq) {
+        return -1;
+      }
+      return 0;
+    }
+  }
+
+  @Override
+  public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
+    SolrParams params = rb.req.getParams();
+    // Turn on spellcheck only when retrieving fields
+    if (!params.getBool(COMPONENT_NAME, false)) return;
+    if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
+      // fetch at least 5 suggestions from each shard
+      int count = sreq.params.getInt(SPELLCHECK_COUNT, 1);
+      if (count < 5) count = 5;
+      sreq.params.set(SPELLCHECK_COUNT, count);
+      sreq.params.set("spellcheck", "true");
+    } else {
+      sreq.params.set("spellcheck", "false");
+    }
+  }
+
+  @Override
+  @SuppressWarnings({"unchecked", "deprecation"})
+  public void finishStage(ResponseBuilder rb) {
+    SolrParams params = rb.req.getParams();
+    if (!params.getBool(COMPONENT_NAME, false) || rb.stage != ResponseBuilder.STAGE_GET_FIELDS)
+      return;
+
+    boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false);
+    boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
+
+    String origQuery = params.get(SPELLCHECK_Q);
+    if (origQuery == null) {
+      origQuery = rb.getQueryString();
+      if (origQuery == null) {
+        origQuery = params.get(CommonParams.Q);
+      }
+    }
+
+    int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1);
+    float min = 0.5f;
+    StringDistance sd = null;
+    int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
+    SolrSpellChecker checker = getSpellChecker(rb.req.getParams());
+    if (checker instanceof AbstractLuceneSpellChecker) {
+      AbstractLuceneSpellChecker spellChecker = (AbstractLuceneSpellChecker) checker;
+      min = spellChecker.getAccuracy();
+      sd = spellChecker.getStringDistance();
+    }
+    if (sd == null)
+      sd = new LevensteinDistance();
+
+    Collection<Token> tokens = null;
+    try {
+      tokens = getTokens(origQuery, checker.getQueryAnalyzer());
+    } catch (IOException e) {
+      LOG.error("Could not get tokens (this should never happen)", e);
+    }
+
+    // original token -> corresponding Suggestion object (keep track of start,end)
+    Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
+    // original token string -> summed up frequency
+    Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
+    // original token string -> set of alternatives
+    // must preserve order because collation algorithm can only work in-order
+    Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
+    // alternative string -> corresponding SuggestWord object
+    Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
+
+    for (ShardRequest sreq : rb.finished) {
+      for (ShardResponse srsp : sreq.responses) {
+        NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
+        LOG.info(srsp.getShard() + " " + nl);
+        if (nl != null) {
+          SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
+          for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) {
+            origVsSuggestion.put(suggestion.getToken(), suggestion);
+            HashSet<String> suggested = origVsSuggested.get(suggestion.getToken());
+            if (suggested == null) {
+              suggested = new HashSet<String>();
+              origVsSuggested.put(suggestion.getToken(), suggested);
+            }
+
+            // sum up original frequency
+            int origFreq = 0;
+            Integer o = origVsFreq.get(suggestion.getToken());
+            if (o != null) origFreq += o;
+            origFreq += suggestion.getOriginalFrequency();
+            origVsFreq.put(suggestion.getToken(), origFreq);
+
+            // find best suggestions
+            for (int i = 0; i < suggestion.getNumFound(); i++) {
+              String alternative = suggestion.getAlternatives().get(i);
+              suggested.add(alternative);
+              SuggestWord sug = suggestedVsWord.get(alternative);
+              if (sug == null) {
+                sug = new SuggestWord();
+                suggestedVsWord.put(alternative, sug);
+              }
+              sug.string = alternative;
+              // alternative frequency is present only for extendedResults=true
+              if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) {
+                Integer freq = suggestion.getAlternativeFrequencies().get(i);
+                if (freq != null) sug.freq += freq;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // all shard responses have been collected
+    // create token and get top suggestions
+    SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand?
+    for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet()) {
+      String original = entry.getKey();
+      HashSet<String> suggested = entry.getValue();
+      SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
+      for (String suggestion : suggested) {
+        SuggestWord sug = suggestedVsWord.get(suggestion);
+        sug.score = sd.getDistance(original, sug.string);
+        if (sug.score < min) continue;
+        sugQueue.insertWithOverflow(sug);
+        if (sugQueue.size() == numSug) {
+          // if the queue is full, maintain the minimum score
+          min = ((SuggestWord) sugQueue.top()).score;
+        }
+      }
+
+      // create token
+      SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
+      Token token = new Token();
+      token.setTermText(original);
+      token.setStartOffset(suggestion.getStartOffset());
+      token.setEndOffset(suggestion.getEndOffset());
+
+      // get top 'count' suggestions out of 'sugQueue.size()' candidates
+      SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())];
+      // skip the first sugQueue.size() - count elements
+      for (int k = 0; k < sugQueue.size() - count; k++) sugQueue.pop();
+      // now collect the top 'count' responses
+      for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) {
+        suggestions[k] = ((SuggestWord) sugQueue.pop());
+      }
+
+      if (extendedResults) {
+        Integer o = origVsFreq.get(original);
+        if (o != null) result.add(token, o);
+        for (SuggestWord word : suggestions)
+          result.add(token, word.string, word.freq);
+      } else {
+        List<String> words = new ArrayList<String>(sugQueue.size());
+        for (SuggestWord word : suggestions) words.add(word.string);
+        result.add(token, words);
+      }
+    }
+
+    NamedList response = new SimpleOrderedMap();
+    response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate));
+    rb.rsp.add("spellcheck", response);
+  }
+
   private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
     Collection<Token> result = new ArrayList<Token>();
     Token token = null;
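Note on the scoring step in finishStage above: merged alternatives are re-scored on the aggregating node with the checker's StringDistance (falling back to LevensteinDistance when none is configured), and only candidates whose similarity reaches the accuracy cutoff (min, 0.5 by default) are kept in the SuggestWordQueue. A minimal standalone sketch of those similarity values; the class name DistanceScoringSketch is hypothetical, and only the LevensteinDistance and StringDistance classes already imported by this patch are used:

import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.StringDistance;

public class DistanceScoringSketch {
  public static void main(String[] args) {
    StringDistance sd = new LevensteinDistance();
    // getDistance returns a similarity in [0, 1]; higher means closer.
    System.out.println(sd.getDistance("toyota", "toyata")); // 1 - 1/6, roughly 0.83: kept
    System.out.println(sd.getDistance("toyota", "ford"));   // well below the 0.5 cutoff: dropped
  }
}

Candidates that survive the cutoff are ranked by similarity first and shard-summed frequency second, which is the ordering SuggestWord.compareTo encodes.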
@@ -77,6 +77,8 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
   protected float accuracy = 0.5f;
   public static final String FIELD = "field";
+
+  protected StringDistance sd;
 
   public String init(NamedList config, SolrCore core) {
     super.init(config, core);
     indexDir = (String) config.get(INDEX_DIR);
@@ -90,7 +92,6 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
     sourceLocation = (String) config.get(LOCATION);
     field = (String) config.get(FIELD);
     String strDistanceName = (String)config.get(STRING_DISTANCE);
-    StringDistance sd = null;
     if (strDistanceName != null) {
       sd = (StringDistance) core.getResourceLoader().newInstance(strDistanceName);
       //TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils...
@@ -226,4 +227,8 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
   public String getSourceLocation() {
     return sourceLocation;
   }
+
+  public StringDistance getStringDistance() {
+    return sd;
+  }
 }
@@ -0,0 +1,39 @@
+package org.apache.solr.handler.component;
+
+import org.apache.solr.BaseDistributedSearchTestCase;
+
+/**
+ * Test for SpellCheckComponent's distributed querying
+ *
+ * @since solr 1.5
+ * @version $Id$
+ * @see org.apache.solr.handler.component.SpellCheckComponent
+ */
+public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase {
+
+  @Override
+  public void doTest() throws Exception {
+    index(id, "1", "lowerfilt", "toyota");
+    index(id, "2", "lowerfilt", "chevrolet");
+    index(id, "3", "lowerfilt", "suzuki");
+    index(id, "4", "lowerfilt", "ford");
+    index(id, "5", "lowerfilt", "ferrari");
+    index(id, "6", "lowerfilt", "jaguar");
+    index(id, "7", "lowerfilt", "mclaren");
+    index(id, "8", "lowerfilt", "sonata");
+    index(id, "9", "lowerfilt", "The quick red fox jumped over the lazy brown dogs.");
+    index(id, "10", "lowerfilt", "blue");
+    index(id, "12", "lowerfilt", "glue");
+    commit();
+
+    handle.clear();
+    handle.put("QTime", SKIPVAL);
+    handle.put("timestamp", SKIPVAL);
+    handle.put("maxScore", SKIPVAL);
+
+    query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q", "toyata", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH");
+    query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q", "toyata", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true");
+    query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q", "bluo", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4");
+    query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true");
+  }
+}
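For completeness, a client-side view of the same kind of distributed request the test issues: a minimal SolrJ sketch, assuming two hypothetical shard hosts and the spellCheckCompRH handler name used by the test configuration; parameter names are written as literal strings rather than the component's constants.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.SpellCheckResponse;

public class DistributedSpellCheckClientSketch {
  public static void main(String[] args) throws Exception {
    // Any node can act as the aggregator; the hosts below are placeholders.
    SolrServer server = new CommonsHttpSolrServer("http://host1:8983/solr");

    SolrQuery query = new SolrQuery("*:*");
    query.set("shards", "host1:8983/solr,host2:8983/solr");
    query.set("qt", "spellCheckCompRH");
    query.set("shards.qt", "spellCheckCompRH");
    query.set("spellcheck", "true");
    query.set("spellcheck.q", "toyata");
    query.set("spellcheck.count", "4");

    QueryResponse rsp = server.query(query);
    // The aggregator's finishStage has already merged the per-shard suggestions.
    SpellCheckResponse spellCheck = rsp.getSpellCheckResponse();
    for (SpellCheckResponse.Suggestion suggestion : spellCheck.getSuggestions()) {
      System.out.println(suggestion.getToken() + " -> " + suggestion.getAlternatives());
    }
  }
}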