mirror of https://github.com/apache/lucene.git
SOLR-2848: generalize distributed spellcheck code to work with any SolrSpellChecker
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1200266 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ee293e7e7d
commit
b97d321f82
|
@ -267,6 +267,10 @@ Bug Fixes
|
|||
* SOLR-2829: Fix problem with false-positives due to incorrect
|
||||
equals methods. (Yonik Seeley, Hossman, Erick Erickson.
|
||||
Marc Tinnemeyer caught the bug)
|
||||
|
||||
* SOLR-2848: Removed 'instanceof AbstractLuceneSpellChecker' hacks from distributed spellchecking code,
|
||||
and added a mergeSuggestions() method to SolrSpellChecker instead. Previously, if you extended SolrSpellChecker,
|
||||
your spellchecker would not work in distributed fashion. (James Dyer via rmuir)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
|
|
@ -22,6 +22,8 @@ import java.io.StringReader;
|
|||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import org.apache.lucene.search.spell.DirectSpellChecker;
|
||||
import org.apache.lucene.search.spell.JaroWinklerDistance;
|
||||
import org.apache.lucene.search.spell.LevensteinDistance;
|
||||
import org.apache.lucene.search.spell.StringDistance;
|
||||
import org.apache.lucene.search.spell.SuggestWord;
|
||||
|
@ -147,7 +149,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
IndexReader reader = rb.req.getSearcher().getIndexReader();
|
||||
boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
|
||||
float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
|
||||
SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest);
|
||||
SolrParams customParams = getCustomParams(getDictionaryName(params), params);
|
||||
SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults,
|
||||
accuracy, customParams);
|
||||
SpellingResult spellingResult = spellChecker.getSuggestions(options);
|
||||
|
@ -210,7 +212,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
* @param params The original SolrParams
|
||||
* @return The new Params
|
||||
*/
|
||||
protected SolrParams getCustomParams(String dictionary, SolrParams params, boolean shardRequest) {
|
||||
protected SolrParams getCustomParams(String dictionary, SolrParams params) {
|
||||
ModifiableSolrParams result = new ModifiableSolrParams();
|
||||
Iterator<String> iter = params.getParameterNamesIterator();
|
||||
String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + ".";
|
||||
|
@ -220,10 +222,6 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
result.add(nxt.substring(prefix.length()), params.getParams(nxt));
|
||||
}
|
||||
}
|
||||
if(shardRequest)
|
||||
{
|
||||
result.add(ShardParams.IS_SHARD, "true");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -256,6 +254,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false);
|
||||
int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0);
|
||||
int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1);
|
||||
int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1);
|
||||
int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
|
||||
|
||||
String origQuery = params.get(SPELLCHECK_Q);
|
||||
if (origQuery == null) {
|
||||
|
@ -263,192 +263,30 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
if (origQuery == null) {
|
||||
origQuery = params.get(CommonParams.Q);
|
||||
}
|
||||
}
|
||||
|
||||
int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1);
|
||||
float min = 0.5f;
|
||||
StringDistance sd = null;
|
||||
int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
|
||||
SolrSpellChecker checker = getSpellChecker(rb.req.getParams());
|
||||
if (checker instanceof AbstractLuceneSpellChecker) {
|
||||
AbstractLuceneSpellChecker spellChecker = (AbstractLuceneSpellChecker) checker;
|
||||
min = spellChecker.getAccuracy();
|
||||
sd = spellChecker.getStringDistance();
|
||||
}
|
||||
if (sd == null)
|
||||
sd = new LevensteinDistance();
|
||||
|
||||
Collection<Token> tokens = null;
|
||||
try {
|
||||
tokens = getTokens(origQuery, checker.getQueryAnalyzer());
|
||||
} catch (IOException e) {
|
||||
LOG.error("Could not get tokens (this should never happen)", e);
|
||||
}
|
||||
|
||||
// original token -> corresponding Suggestion object (keep track of start,end)
|
||||
Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
|
||||
// original token string -> summed up frequency
|
||||
Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
|
||||
// original token string -> # of shards reporting it as misspelled
|
||||
Map<String, Integer> origVsShards = new HashMap<String, Integer>();
|
||||
// original token string -> set of alternatives
|
||||
// must preserve order because collation algorithm can only work in-order
|
||||
Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
|
||||
// alternative string -> corresponding SuggestWord object
|
||||
Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
|
||||
Map<String, SpellCheckCollation> collations = new HashMap<String, SpellCheckCollation>();
|
||||
}
|
||||
|
||||
int totalNumberShardResponses = 0;
|
||||
SpellCheckMergeData mergeData = new SpellCheckMergeData();
|
||||
for (ShardRequest sreq : rb.finished) {
|
||||
for (ShardResponse srsp : sreq.responses) {
|
||||
NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
|
||||
LOG.info(srsp.getShard() + " " + nl);
|
||||
if (nl != null) {
|
||||
totalNumberShardResponses++;
|
||||
SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
|
||||
for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) {
|
||||
origVsSuggestion.put(suggestion.getToken(), suggestion);
|
||||
HashSet<String> suggested = origVsSuggested.get(suggestion.getToken());
|
||||
if (suggested == null) {
|
||||
suggested = new HashSet<String>();
|
||||
origVsSuggested.put(suggestion.getToken(), suggested);
|
||||
}
|
||||
|
||||
// sum up original frequency
|
||||
int origFreq = 0;
|
||||
Integer o = origVsFreq.get(suggestion.getToken());
|
||||
if (o != null) origFreq += o;
|
||||
origFreq += suggestion.getOriginalFrequency();
|
||||
origVsFreq.put(suggestion.getToken(), origFreq);
|
||||
|
||||
//# shards reporting
|
||||
Integer origShards = origVsShards.get(suggestion.getToken());
|
||||
if(origShards==null) {
|
||||
origVsShards.put(suggestion.getToken(), 1);
|
||||
} else {
|
||||
origVsShards.put(suggestion.getToken(), ++origShards);
|
||||
}
|
||||
|
||||
// find best suggestions
|
||||
for (int i = 0; i < suggestion.getNumFound(); i++) {
|
||||
String alternative = suggestion.getAlternatives().get(i);
|
||||
suggested.add(alternative);
|
||||
SuggestWord sug = suggestedVsWord.get(alternative);
|
||||
if (sug == null) {
|
||||
sug = new SuggestWord();
|
||||
suggestedVsWord.put(alternative, sug);
|
||||
}
|
||||
sug.string = alternative;
|
||||
// alternative frequency is present only for extendedResults=true
|
||||
if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) {
|
||||
Integer freq = suggestion.getAlternativeFrequencies().get(i);
|
||||
if (freq != null) sug.freq += freq;
|
||||
}
|
||||
}
|
||||
}
|
||||
NamedList suggestions = (NamedList) nl.get("suggestions");
|
||||
if(suggestions != null) {
|
||||
List<Object> collationList = suggestions.getAll("collation");
|
||||
List<Object> collationRankList = suggestions.getAll("collationInternalRank");
|
||||
int i=0;
|
||||
if(collationList != null) {
|
||||
for(Object o : collationList)
|
||||
{
|
||||
if(o instanceof String)
|
||||
{
|
||||
SpellCheckCollation coll = new SpellCheckCollation();
|
||||
coll.setCollationQuery((String) o);
|
||||
if(collationRankList!= null && collationRankList.size()>0)
|
||||
{
|
||||
coll.setInternalRank((Integer) collationRankList.get(i));
|
||||
i++;
|
||||
}
|
||||
SpellCheckCollation priorColl = collations.get(coll.getCollationQuery());
|
||||
if(priorColl != null)
|
||||
{
|
||||
coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank()));
|
||||
}
|
||||
collations.put(coll.getCollationQuery(), coll);
|
||||
} else
|
||||
{
|
||||
NamedList expandedCollation = (NamedList) o;
|
||||
SpellCheckCollation coll = new SpellCheckCollation();
|
||||
coll.setCollationQuery((String) expandedCollation.get("collationQuery"));
|
||||
coll.setHits((Integer) expandedCollation.get("hits"));
|
||||
if(maxCollationTries>0)
|
||||
{
|
||||
coll.setInternalRank((Integer) expandedCollation.get("collationInternalRank"));
|
||||
}
|
||||
coll.setMisspellingsAndCorrections((NamedList) expandedCollation.get("misspellingsAndCorrections"));
|
||||
SpellCheckCollation priorColl = collations.get(coll.getCollationQuery());
|
||||
if(priorColl != null)
|
||||
{
|
||||
coll.setHits(coll.getHits() + priorColl.getHits());
|
||||
coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank()));
|
||||
}
|
||||
collations.put(coll.getCollationQuery(), coll);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
mergeData.totalNumberShardResponses++;
|
||||
collectShardSuggestions(nl, mergeData);
|
||||
collectShardCollations(mergeData, nl, maxCollationTries);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// all shard responses have been collected
|
||||
// create token and get top suggestions
|
||||
SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand?
|
||||
for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet()) {
|
||||
String original = entry.getKey();
|
||||
|
||||
//Only use this suggestion if all shards reported it as misspelled.
|
||||
Integer numShards = origVsShards.get(original);
|
||||
if(numShards<totalNumberShardResponses) {
|
||||
continue;
|
||||
}
|
||||
|
||||
HashSet<String> suggested = entry.getValue();
|
||||
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
|
||||
for (String suggestion : suggested) {
|
||||
SuggestWord sug = suggestedVsWord.get(suggestion);
|
||||
sug.score = sd.getDistance(original, sug.string);
|
||||
if (sug.score < min) continue;
|
||||
sugQueue.insertWithOverflow(sug);
|
||||
if (sugQueue.size() == numSug) {
|
||||
// if queue full, maintain the minScore score
|
||||
min = sugQueue.top().score;
|
||||
}
|
||||
}
|
||||
|
||||
// create token
|
||||
SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
|
||||
Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());
|
||||
|
||||
// get top 'count' suggestions out of 'sugQueue.size()' candidates
|
||||
SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())];
|
||||
// skip the first sugQueue.size() - count elements
|
||||
for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop();
|
||||
// now collect the top 'count' responses
|
||||
for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) {
|
||||
suggestions[k] = sugQueue.pop();
|
||||
}
|
||||
|
||||
if (extendedResults) {
|
||||
Integer o = origVsFreq.get(original);
|
||||
if (o != null) result.addFrequency(token, o);
|
||||
for (SuggestWord word : suggestions)
|
||||
result.add(token, word.string, word.freq);
|
||||
} else {
|
||||
List<String> words = new ArrayList<String>(sugQueue.size());
|
||||
for (SuggestWord word : suggestions) words.add(word.string);
|
||||
result.add(token, words);
|
||||
}
|
||||
}
|
||||
SolrSpellChecker checker = getSpellChecker(rb.req.getParams());
|
||||
SpellingResult result = checker.mergeSuggestions(mergeData, numSug, count, extendedResults);
|
||||
|
||||
NamedList response = new SimpleOrderedMap();
|
||||
NamedList suggestions = toNamedList(false, result, origQuery, extendedResults, collate);
|
||||
if (collate) {
|
||||
SpellCheckCollation[] sortedCollations = collations.values().toArray(new SpellCheckCollation[collations.size()]);
|
||||
SpellCheckCollation[] sortedCollations = mergeData.collations.values().toArray(new SpellCheckCollation[mergeData.collations.size()]);
|
||||
Arrays.sort(sortedCollations);
|
||||
int i = 0;
|
||||
while (i < maxCollations && i < sortedCollations.length) {
|
||||
|
@ -470,6 +308,101 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
response.add("suggestions", suggestions);
|
||||
rb.rsp.add("spellcheck", response);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void collectShardSuggestions(NamedList nl, SpellCheckMergeData mergeData) {
|
||||
SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
|
||||
for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) {
|
||||
mergeData.origVsSuggestion.put(suggestion.getToken(), suggestion);
|
||||
HashSet<String> suggested = mergeData.origVsSuggested.get(suggestion.getToken());
|
||||
if (suggested == null) {
|
||||
suggested = new HashSet<String>();
|
||||
mergeData.origVsSuggested.put(suggestion.getToken(), suggested);
|
||||
}
|
||||
|
||||
// sum up original frequency
|
||||
int origFreq = 0;
|
||||
Integer o = mergeData.origVsFreq.get(suggestion.getToken());
|
||||
if (o != null) origFreq += o;
|
||||
origFreq += suggestion.getOriginalFrequency();
|
||||
mergeData.origVsFreq.put(suggestion.getToken(), origFreq);
|
||||
|
||||
//# shards reporting
|
||||
Integer origShards = mergeData.origVsShards.get(suggestion.getToken());
|
||||
if(origShards==null) {
|
||||
mergeData.origVsShards.put(suggestion.getToken(), 1);
|
||||
} else {
|
||||
mergeData.origVsShards.put(suggestion.getToken(), ++origShards);
|
||||
}
|
||||
|
||||
// find best suggestions
|
||||
for (int i = 0; i < suggestion.getNumFound(); i++) {
|
||||
String alternative = suggestion.getAlternatives().get(i);
|
||||
suggested.add(alternative);
|
||||
SuggestWord sug = mergeData.suggestedVsWord.get(alternative);
|
||||
if (sug == null) {
|
||||
sug = new SuggestWord();
|
||||
mergeData.suggestedVsWord.put(alternative, sug);
|
||||
}
|
||||
sug.string = alternative;
|
||||
// alternative frequency is present only for extendedResults=true
|
||||
if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) {
|
||||
Integer freq = suggestion.getAlternativeFrequencies().get(i);
|
||||
if (freq != null) sug.freq += freq;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void collectShardCollations(SpellCheckMergeData mergeData, NamedList spellCheckResponse, int maxCollationTries) {
|
||||
Map<String, SpellCheckCollation> collations = mergeData.collations;
|
||||
NamedList suggestions = (NamedList) spellCheckResponse.get("suggestions");
|
||||
if(suggestions != null) {
|
||||
List<Object> collationList = suggestions.getAll("collation");
|
||||
List<Object> collationRankList = suggestions.getAll("collationInternalRank");
|
||||
int i=0;
|
||||
if(collationList != null) {
|
||||
for(Object o : collationList)
|
||||
{
|
||||
if(o instanceof String)
|
||||
{
|
||||
SpellCheckCollation coll = new SpellCheckCollation();
|
||||
coll.setCollationQuery((String) o);
|
||||
if(collationRankList!= null && collationRankList.size()>0)
|
||||
{
|
||||
coll.setInternalRank((Integer) collationRankList.get(i));
|
||||
i++;
|
||||
}
|
||||
SpellCheckCollation priorColl = collations.get(coll.getCollationQuery());
|
||||
if(priorColl != null)
|
||||
{
|
||||
coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank()));
|
||||
}
|
||||
collations.put(coll.getCollationQuery(), coll);
|
||||
} else
|
||||
{
|
||||
NamedList expandedCollation = (NamedList) o;
|
||||
SpellCheckCollation coll = new SpellCheckCollation();
|
||||
coll.setCollationQuery((String) expandedCollation.get("collationQuery"));
|
||||
coll.setHits((Integer) expandedCollation.get("hits"));
|
||||
if(maxCollationTries>0)
|
||||
{
|
||||
coll.setInternalRank((Integer) expandedCollation.get("collationInternalRank"));
|
||||
}
|
||||
coll.setMisspellingsAndCorrections((NamedList) expandedCollation.get("misspellingsAndCorrections"));
|
||||
SpellCheckCollation priorColl = collations.get(coll.getCollationQuery());
|
||||
if(priorColl != null)
|
||||
{
|
||||
coll.setHits(coll.getHits() + priorColl.getHits());
|
||||
coll.setInternalRank(Math.max(coll.getInternalRank(),priorColl.getInternalRank()));
|
||||
}
|
||||
collations.put(coll.getCollationQuery(), coll);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
|
||||
Collection<Token> result = new ArrayList<Token>();
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
package org.apache.solr.handler.component;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.search.spell.SuggestWord;
|
||||
import org.apache.solr.client.solrj.response.SpellCheckResponse;
|
||||
import org.apache.solr.spelling.SpellCheckCollation;
|
||||
|
||||
public class SpellCheckMergeData {
|
||||
//original token -> corresponding Suggestion object (keep track of start,end)
|
||||
public Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
|
||||
// original token string -> summed up frequency
|
||||
public Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
|
||||
// original token string -> # of shards reporting it as misspelled
|
||||
public Map<String, Integer> origVsShards = new HashMap<String, Integer>();
|
||||
// original token string -> set of alternatives
|
||||
// must preserve order because collation algorithm can only work in-order
|
||||
public Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
|
||||
// alternative string -> corresponding SuggestWord object
|
||||
public Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
|
||||
public Map<String, SpellCheckCollation> collations = new HashMap<String, SpellCheckCollation>();
|
||||
public int totalNumberShardResponses = 0;
|
||||
}
|
|
@ -141,13 +141,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
|
||||
@Override
|
||||
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
|
||||
boolean shardRequest = false;
|
||||
SolrParams params = options.customParams;
|
||||
if(params!=null)
|
||||
{
|
||||
shardRequest = "true".equals(params.get(ShardParams.IS_SHARD));
|
||||
}
|
||||
SpellingResult result = new SpellingResult(options.tokens);
|
||||
SpellingResult result = new SpellingResult(options.tokens);
|
||||
IndexReader reader = determineReader(options.reader);
|
||||
Term term = field != null ? new Term(field, "") : null;
|
||||
float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;
|
||||
|
@ -176,7 +170,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
term = new Term(field, suggestions[i]);
|
||||
result.add(token, suggestions[i], reader.docFreq(term));
|
||||
}
|
||||
} else if(shardRequest) {
|
||||
} else {
|
||||
List<String> suggList = Collections.emptyList();
|
||||
result.add(token, suggList);
|
||||
}
|
||||
|
@ -187,7 +181,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
suggList = suggList.subList(0, options.count);
|
||||
}
|
||||
result.add(token, suggList);
|
||||
} else if(shardRequest) {
|
||||
} else {
|
||||
List<String> suggList = Collections.emptyList();
|
||||
result.add(token, suggList);
|
||||
}
|
||||
|
@ -222,6 +216,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
/**
 * Returns the value of this spellchecker's {@code accuracy} field.
 *
 * @return the Accuracy used for the Spellchecker
 */
|
||||
@Override
|
||||
public float getAccuracy() {
|
||||
return accuracy;
|
||||
}
|
||||
|
@ -257,6 +252,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
|||
return sourceLocation;
|
||||
}
|
||||
|
||||
/**
 * Returns the value of this spellchecker's {@code sd} field.
 * NOTE(review): presumably null when no distance measure was configured - confirm.
 *
 * @return the string distance held in {@code sd}
 */
@Override
|
||||
public StringDistance getStringDistance() {
|
||||
return sd;
|
||||
}
|
||||
|
|
|
@ -18,7 +18,9 @@ package org.apache.solr.spelling;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
|
@ -29,6 +31,8 @@ import org.apache.lucene.search.spell.SuggestMode;
|
|||
import org.apache.lucene.search.spell.SuggestWord;
|
||||
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
|
||||
import org.apache.lucene.search.spell.SuggestWordQueue;
|
||||
import org.apache.solr.common.params.ShardParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.params.SpellingParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
|
@ -174,19 +178,41 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
|
|||
public SpellingResult getSuggestions(SpellingOptions options)
|
||||
throws IOException {
|
||||
LOG.debug("getSuggestions: " + options.tokens);
|
||||
|
||||
|
||||
SpellingResult result = new SpellingResult();
|
||||
float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;
|
||||
SuggestMode mode = options.onlyMorePopular ? SuggestMode.SUGGEST_MORE_POPULAR : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
|
||||
for (Token token : options.tokens) {
|
||||
Term term = new Term(field, token.toString());
|
||||
SuggestWord[] suggestions = checker.suggestSimilar(term,
|
||||
options.count, options.reader, mode, accuracy);
|
||||
result.addFrequency(token, options.reader.docFreq(term));
|
||||
for (SuggestWord suggestion : suggestions) {
|
||||
result.add(token, suggestion.string, suggestion.freq);
|
||||
options.count, options.reader, mode, accuracy);
|
||||
|
||||
int docFreq = 0;
|
||||
if(options.extendedResults || suggestions.length==0) {
|
||||
docFreq = options.reader.docFreq(term);
|
||||
}
|
||||
|
||||
if(options.extendedResults) {
|
||||
result.addFrequency(token, docFreq);
|
||||
}
|
||||
if(suggestions.length==0 && docFreq==0) {
|
||||
List<String> empty = Collections.emptyList();
|
||||
result.add(token, empty);
|
||||
} else {
|
||||
for (SuggestWord suggestion : suggestions) {
|
||||
result.add(token, suggestion.string, suggestion.freq);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Returns the accuracy reported by the wrapped {@code checker}.
 *
 * @return the result of {@code checker.getAccuracy()}
 */
@Override
|
||||
public float getAccuracy() {
|
||||
return checker.getAccuracy();
|
||||
}
|
||||
/**
 * Returns the string distance reported by the wrapped {@code checker}.
 *
 * @return the result of {@code checker.getDistance()}
 */
@Override
|
||||
public StringDistance getStringDistance() {
|
||||
return checker.getDistance();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -59,6 +59,9 @@ public class PossibilityIterator implements Iterator<RankedSpellPossibility> {
|
|||
public PossibilityIterator(Map<Token, LinkedHashMap<String, Integer>> suggestions, int maximumRequiredSuggestions, int maxEvaluations) {
|
||||
for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
|
||||
Token token = entry.getKey();
|
||||
if(entry.getValue().size()==0) {
|
||||
continue;
|
||||
}
|
||||
List<SpellCheckCorrection> possibleCorrections = new ArrayList<SpellCheckCorrection>();
|
||||
for (Map.Entry<String, Integer> entry1 : entry.getValue().entrySet()) {
|
||||
SpellCheckCorrection correction = new SpellCheckCorrection();
|
||||
|
|
|
@ -17,13 +17,24 @@ package org.apache.solr.spelling;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.search.spell.LevensteinDistance;
|
||||
import org.apache.lucene.search.spell.StringDistance;
|
||||
import org.apache.lucene.search.spell.SuggestWord;
|
||||
import org.apache.lucene.search.spell.SuggestWordQueue;
|
||||
import org.apache.solr.client.solrj.response.SpellCheckResponse;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.handler.component.SpellCheckMergeData;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -64,6 +75,74 @@ public abstract class SolrSpellChecker {
|
|||
}
|
||||
return name;
|
||||
}
|
||||
/**
|
||||
* Integrate spelling suggestions from the various shards in a distributed environment.
|
||||
*
|
||||
* @param mergeData
|
||||
* @param numSug
|
||||
* @param count
|
||||
* @param extendedResults
|
||||
* @return
|
||||
*/
|
||||
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
|
||||
float min = 0.5f;
|
||||
try {
|
||||
min = getAccuracy();
|
||||
} catch(UnsupportedOperationException uoe) {
|
||||
//just use .5 as a default
|
||||
}
|
||||
|
||||
StringDistance sd = getStringDistance() == null ? new LevensteinDistance() : getStringDistance();
|
||||
|
||||
SpellingResult result = new SpellingResult();
|
||||
for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
|
||||
String original = entry.getKey();
|
||||
|
||||
//Only use this suggestion if all shards reported it as misspelled.
|
||||
Integer numShards = mergeData.origVsShards.get(original);
|
||||
if(numShards<mergeData.totalNumberShardResponses) {
|
||||
continue;
|
||||
}
|
||||
|
||||
HashSet<String> suggested = entry.getValue();
|
||||
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
|
||||
for (String suggestion : suggested) {
|
||||
SuggestWord sug = mergeData.suggestedVsWord.get(suggestion);
|
||||
sug.score = sd.getDistance(original, sug.string);
|
||||
if (sug.score < min) continue;
|
||||
sugQueue.insertWithOverflow(sug);
|
||||
if (sugQueue.size() == numSug) {
|
||||
// if queue full, maintain the minScore score
|
||||
min = sugQueue.top().score;
|
||||
}
|
||||
}
|
||||
|
||||
// create token
|
||||
SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original);
|
||||
Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());
|
||||
|
||||
// get top 'count' suggestions out of 'sugQueue.size()' candidates
|
||||
SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())];
|
||||
// skip the first sugQueue.size() - count elements
|
||||
for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop();
|
||||
// now collect the top 'count' responses
|
||||
for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) {
|
||||
suggestions[k] = sugQueue.pop();
|
||||
}
|
||||
|
||||
if (extendedResults) {
|
||||
Integer o = mergeData.origVsFreq.get(original);
|
||||
if (o != null) result.addFrequency(token, o);
|
||||
for (SuggestWord word : suggestions)
|
||||
result.add(token, word.string, word.freq);
|
||||
} else {
|
||||
List<String> words = new ArrayList<String>(sugQueue.size());
|
||||
for (SuggestWord word : suggestions) words.add(word.string);
|
||||
result.add(token, words);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public Analyzer getQueryAnalyzer() {
|
||||
return analyzer;
|
||||
|
@ -84,6 +163,23 @@ public abstract class SolrSpellChecker {
|
|||
* (re)Builds the spelling index. May be a NOOP if the implementation doesn't require building, or can't be rebuilt.
|
||||
*/
|
||||
public abstract void build(SolrCore core, SolrIndexSearcher searcher);
|
||||
|
||||
/**
 * Get the value of {@link org.apache.solr.common.params.SpellingParams#SPELLCHECK_ACCURACY} if supported.
 * This base implementation does not support it and always throws
 * UnsupportedOperationException.
 *
 * @return the accuracy, for implementations that support it
 * @throws UnsupportedOperationException always, in this base implementation
 */
|
||||
protected float getAccuracy() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the distance implementation used by this spellchecker, or NULL if not applicable.
|
||||
* @return
|
||||
*/
|
||||
protected StringDistance getStringDistance() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
|
|
@ -323,6 +323,12 @@
|
|||
<str name="spellcheckIndexDir">spellchecker1</str>
|
||||
<str name="buildOnCommit">false</str>
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="name">direct</str>
|
||||
<str name="classname">DirectSolrSpellChecker</str>
|
||||
<str name="field">lowerfilt</str>
|
||||
<int name="minQueryLength">3</int>
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="name">multipleFields</str>
|
||||
<str name="field">lowerfilt1and2</str>
|
||||
|
@ -397,6 +403,17 @@
|
|||
<arr name="last-components">
|
||||
<str>spellcheck</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
<requestHandler name="spellCheckCompRH_Direct" class="org.apache.solr.handler.component.SearchHandler">
|
||||
<lst name="defaults">
|
||||
<str name="spellcheck.dictionary">direct</str>
|
||||
<str name="spellcheck.onlyMorePopular">false</str>
|
||||
<str name="spellcheck.extendedResults">false</str>
|
||||
<str name="spellcheck.count">1</str>
|
||||
</lst>
|
||||
<arr name="last-components">
|
||||
<str>spellcheck</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
<requestHandler name="spellCheckCompRH1" class="org.apache.solr.handler.component.SearchHandler">
|
||||
<lst name="defaults">
|
||||
|
|
|
@ -30,10 +30,13 @@ import org.apache.solr.common.params.ModifiableSolrParams;
|
|||
*/
|
||||
public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTestCase {
|
||||
|
||||
private String requestHandlerName;
|
||||
|
||||
// No-arg constructor with an effectively empty body: the three assignments below are
// commented out, so nothing is changed here.
// NOTE(review): fixShardCount / shardCount / stress look like settings of the distributed
// test base class - confirm before re-enabling any of them.
public DistributedSpellCheckComponentTest()
|
||||
{
|
||||
//fixShardCount=true;
|
||||
//shardCount=2;
|
||||
//stress=0;
|
||||
}
|
||||
|
||||
private String saveProp;
|
||||
|
@ -41,7 +44,8 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
|
|||
public void setUp() throws Exception {
|
||||
// this test requires FSDir
|
||||
saveProp = System.getProperty("solr.directoryFactory");
|
||||
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
||||
System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory");
|
||||
requestHandlerName = random.nextBoolean() ? "spellCheckCompRH" : "spellCheckCompRH_Direct";
|
||||
super.setUp();
|
||||
}
|
||||
|
||||
|
@ -104,15 +108,17 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
|
|||
handle.put("maxScore", SKIPVAL);
|
||||
// we care only about the spellcheck results
|
||||
handle.put("response", SKIP);
|
||||
|
||||
q("q", "*:*", SpellCheckComponent.SPELLCHECK_BUILD, "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH");
|
||||
|
||||
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH");
|
||||
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true");
|
||||
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4");
|
||||
query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true");
|
||||
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName);
|
||||
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","toyata", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true");
|
||||
query("q", "*:*", "fl", "id,lowerfilt", "spellcheck.q","bluo", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4");
|
||||
query("q", "The quick reb fox jumped over the lazy brown dogs", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "4", SpellCheckComponent.SPELLCHECK_COLLATE, "true");
|
||||
|
||||
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true");
|
||||
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
|
||||
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "0", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
|
||||
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true");
|
||||
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
|
||||
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "0", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -175,7 +175,7 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 {
|
|||
result = checker.getSuggestions(spellOpts);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(spellOpts.tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
assertTrue("suggestions size should be 0", suggestions.size()==0);
|
||||
searcher.decref();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -140,7 +140,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
|
|||
result = checker.getSuggestions(spellOpts);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(spellOpts.tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
assertTrue("suggestions size should be 0", suggestions.size()==0);
|
||||
|
||||
//test something that is spelled correctly
|
||||
spellOpts.tokens = queryConverter.convert("document");
|
||||
|
@ -215,7 +215,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
|
|||
result = checker.getSuggestions(spellOpts);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(spellOpts.tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
assertTrue("suggestions size should be 0", suggestions.size()==0);
|
||||
|
||||
spellOpts.tokens = queryConverter.convert("document");
|
||||
result = checker.getSuggestions(spellOpts);
|
||||
|
@ -328,7 +328,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
|
|||
result = checker.getSuggestions(spellOpts);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(spellOpts.tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
assertTrue("suggestions size should be 0", suggestions.size()==0);
|
||||
|
||||
spellOpts.tokens = queryConverter.convert("Caroline");
|
||||
result = checker.getSuggestions(spellOpts);
|
||||
|
|
Loading…
Reference in New Issue